From 14f3b15d2175d489d7591956405d46ff9d78b00a Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Sun, 21 Jul 2013 15:59:28 -0700 Subject: [PATCH 001/106] resurrect quantile scaler --- sklearn/preprocessing.py | 1433 +++++++++++++++++++++++++++ sklearn/tests/test_preprocessing.py | 841 ++++++++++++++++ 2 files changed, 2274 insertions(+) create mode 100644 sklearn/preprocessing.py create mode 100644 sklearn/tests/test_preprocessing.py diff --git a/sklearn/preprocessing.py b/sklearn/preprocessing.py new file mode 100644 index 0000000000000..1de57535ec14f --- /dev/null +++ b/sklearn/preprocessing.py @@ -0,0 +1,1433 @@ +# Authors: Alexandre Gramfort +# Mathieu Blondel +# Olivier Grisel +# Andreas Mueller +# License: BSD 3 clause + +import warnings +import numbers + +import numpy as np +import scipy.sparse as sp + +from numpy.testing import assert_almost_equal + +from .base import BaseEstimator, TransformerMixin +from .externals.six import string_types +from .utils import check_arrays +from .utils import array2d +from .utils import atleast2d_or_csr +from .utils import atleast2d_or_csc +from .utils import safe_asarray +from .utils import warn_if_not_float +from .utils.fixes import unique + +from .utils.multiclass import unique_labels +from .utils.multiclass import is_multilabel +from .utils.multiclass import type_of_target + +from .utils.sparsefuncs import inplace_csr_row_normalize_l1 +from .utils.sparsefuncs import inplace_csr_row_normalize_l2 +from .utils.sparsefuncs import inplace_csr_column_scale +from .utils.sparsefuncs import mean_variance_axis0 +from .externals import six + +zip = six.moves.zip +map = six.moves.map + +__all__ = ['Binarizer', + 'KernelCenterer', + 'LabelBinarizer', + 'LabelEncoder', + 'MinMaxScaler', + 'Normalizer', + 'OneHotEncoder', + 'StandardScaler', + 'binarize', + 'normalize', + 'scale'] + + +def _mean_and_std(X, axis=0, with_mean=True, with_std=True): + """Compute mean and std deviation for centering, scaling. + + Zero valued std components are reset to 1.0 to avoid NaNs when scaling. + """ + X = np.asarray(X) + Xr = np.rollaxis(X, axis) + + if with_mean: + mean_ = Xr.mean(axis=0) + else: + mean_ = None + + if with_std: + std_ = Xr.std(axis=0) + if isinstance(std_, np.ndarray): + std_[std_ == 0.0] = 1.0 + elif std_ == 0.: + std_ = 1. + else: + std_ = None + + return mean_, std_ + + +def scale(X, axis=0, with_mean=True, with_std=True, copy=True): + """Standardize a dataset along any axis + + Center to the mean and component wise scale to unit variance. + + Parameters + ---------- + X : array-like or CSR matrix. + The data to center and scale. + + axis : int (0 by default) + axis used to compute the means and standard deviations along. If 0, + independently standardize each feature, otherwise (if 1) standardize + each sample. + + with_mean : boolean, True by default + If True, center the data before scaling. + + with_std : boolean, True by default + If True, scale the data to unit variance (or equivalently, + unit standard deviation). + + copy : boolean, optional, default is True + set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array or a scipy.sparse + CSR matrix and if axis is 1). + + Notes + ----- + This implementation will refuse to center scipy.sparse matrices + since it would make them non-sparse and would potentially crash the + program with memory exhaustion problems. 
+ + Instead the caller is expected to either set explicitly + `with_mean=False` (in that case, only variance scaling will be + performed on the features of the CSR matrix) or to call `X.toarray()` + if he/she expects the materialized dense array to fit in memory. + + To avoid memory copy the caller should pass a CSR matrix. + + See also + -------- + :class:`sklearn.preprocessing.StandardScaler` to perform centering and + scaling using the ``Transformer`` API (e.g. as part of a preprocessing + :class:`sklearn.pipeline.Pipeline`) + """ + if sp.issparse(X): + if with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` instead" + " See docstring for motivation and alternatives.") + if axis != 0: + raise ValueError("Can only scale sparse matrix on axis=0, " + " got axis=%d" % axis) + warn_if_not_float(X, estimator='The scale function') + if not sp.isspmatrix_csr(X): + X = X.tocsr() + copy = False + if copy: + X = X.copy() + _, var = mean_variance_axis0(X) + var[var == 0.0] = 1.0 + inplace_csr_column_scale(X, 1 / np.sqrt(var)) + else: + X = np.asarray(X) + warn_if_not_float(X, estimator='The scale function') + mean_, std_ = _mean_and_std( + X, axis, with_mean=with_mean, with_std=with_std) + if copy: + X = X.copy() + # Xr is a view on the original array that enables easy use of + # broadcasting on the axis in which we are interested in + Xr = np.rollaxis(X, axis) + if with_mean: + Xr -= mean_ + if with_std: + Xr /= std_ + return X + + +class MinMaxScaler(BaseEstimator, TransformerMixin): + """Standardizes features by scaling each feature to a given range. + + This estimator scales and translates each feature individually such + that it is in the given range on the training set, i.e. between + zero and one. + + The standardization is given by:: + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + This standardization is often used as an alternative to zero mean, + unit variance scaling. + + Parameters + ---------- + feature_range: tuple (min, max), default=(0, 1) + Desired range of transformed data. + + copy : boolean, optional, default is True + Set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array). + + Attributes + ---------- + `min_` : ndarray, shape (n_features,) + Per feature adjustment for minimum. + + `scale_` : ndarray, shape (n_features,) + Per feature relative scaling of the data. + """ + + def __init__(self, feature_range=(0, 1), copy=True): + self.feature_range = feature_range + self.copy = copy + + def fit(self, X, y=None): + """Compute the minimum and maximum to be used for later scaling. + + Parameters + ---------- + X : array-like, shape [n_samples, n_features] + The data used to compute the per-feature minimum and maximum + used for later scaling along the features axis. + """ + X = check_arrays(X, sparse_format="dense", copy=self.copy)[0] + warn_if_not_float(X, estimator=self) + feature_range = self.feature_range + if feature_range[0] >= feature_range[1]: + raise ValueError("Minimum of desired feature range must be smaller" + " than maximum. Got %s." 
% str(feature_range)) + data_min = np.min(X, axis=0) + data_range = np.max(X, axis=0) - data_min + # Do not scale constant features + data_range[data_range == 0.0] = 1.0 + self.scale_ = (feature_range[1] - feature_range[0]) / data_range + self.min_ = feature_range[0] - data_min * self.scale_ + self.data_range = data_range + self.data_min = data_min + return self + + def transform(self, X): + """Scaling features of X according to feature_range. + + Parameters + ---------- + X : array-like with shape [n_samples, n_features] + Input data that will be transformed. + """ + X = check_arrays(X, sparse_format="dense", copy=self.copy)[0] + X *= self.scale_ + X += self.min_ + return X + + def inverse_transform(self, X): + """Undo the scaling of X according to feature_range. + + Parameters + ---------- + X : array-like with shape [n_samples, n_features] + Input data that will be transformed. + """ + X = check_arrays(X, sparse_format="dense", copy=self.copy)[0] + X -= self.min_ + X /= self.scale_ + return X + + +class StandardScaler(BaseEstimator, TransformerMixin): + """Standardize features by removing the mean and scaling to unit variance + + Centering and scaling happen independently on each feature by computing + the relevant statistics on the samples in the training set. Mean and + standard deviation are then stored to be used on later data using the + `transform` method. + + Standardization of a dataset is a common requirement for many + machine learning estimators: they might behave badly if the + individual feature do not more or less look like standard normally + distributed data (e.g. Gaussian with 0 mean and unit variance). + + For instance many elements used in the objective function of + a learning algorithm (such as the RBF kernel of Support Vector + Machines or the L1 and L2 regularizers of linear models) assume that + all features are centered around 0 and have variance in the same + order. If a feature has a variance that is orders of magnitude larger + that others, it might dominate the objective function and make the + estimator unable to learn from other features correctly as expected. + + Parameters + ---------- + with_mean : boolean, True by default + If True, center the data before scaling. + This does not work (and will raise an exception) when attempted on + sparse matrices, because centering them entails building a dense + matrix which in common use cases is likely to be too large to fit in + memory. + + with_std : boolean, True by default + If True, scale the data to unit variance (or equivalently, + unit standard deviation). + + copy : boolean, optional, default is True + If False, try to avoid a copy and do inplace scaling instead. + This is not guaranteed to always work inplace; e.g. if the data is + not a NumPy array or scipy.sparse CSR matrix, a copy may still be + returned. + + Attributes + ---------- + `mean_` : array of floats with shape [n_features] + The mean value for each feature in the training set. + + `std_` : array of floats with shape [n_features] + The standard deviation for each feature in the training set. + + See also + -------- + :func:`sklearn.preprocessing.scale` to perform centering and + scaling without using the ``Transformer`` object oriented API + + :class:`sklearn.preprocessing.RankScaler` to perform standardization + that is more robust to outliers, but slower and more memory-intensive. + + :class:`sklearn.decomposition.RandomizedPCA` with `whiten=True` + to further remove the linear correlation across features. 
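+
+    Examples
+    --------
+    A small toy illustration (the expected values follow directly from the
+    per-feature mean and standard deviation computed during ``fit``):
+
+    >>> from sklearn.preprocessing import StandardScaler
+    >>> data = [[0., 0.], [0., 0.], [1., 1.], [1., 1.]]
+    >>> scaler = StandardScaler().fit(data)
+    >>> print(scaler.mean_)
+    [ 0.5  0.5]
+    >>> print(scaler.transform(data))
+    [[-1. -1.]
+     [-1. -1.]
+     [ 1.  1.]
+     [ 1.  1.]]
+    >>> print(scaler.transform([[2., 2.]]))
+    [[ 3.  3.]]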
+ """ + + def __init__(self, copy=True, with_mean=True, with_std=True): + self.with_mean = with_mean + self.with_std = with_std + self.copy = copy + + def fit(self, X, y=None): + """Compute the mean and std to be used for later scaling. + + Parameters + ---------- + X : array-like or CSR matrix with shape [n_samples, n_features] + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + """ + X = check_arrays(X, copy=self.copy, sparse_format="csr")[0] + if sp.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` " + "instead. See docstring for motivation and alternatives.") + warn_if_not_float(X, estimator=self) + self.mean_ = None + + if self.with_std: + var = mean_variance_axis0(X)[1] + self.std_ = np.sqrt(var) + self.std_[var == 0.0] = 1.0 + else: + self.std_ = None + return self + else: + warn_if_not_float(X, estimator=self) + self.mean_, self.std_ = _mean_and_std( + X, axis=0, with_mean=self.with_mean, with_std=self.with_std) + return self + + def transform(self, X, y=None, copy=None): + """Perform standardization by centering and scaling + + Parameters + ---------- + X : array-like with shape [n_samples, n_features] + The data used to scale along the features axis. + """ + copy = copy if copy is not None else self.copy + X = check_arrays(X, copy=copy, sparse_format="csr")[0] + if sp.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` " + "instead See docstring for motivation and alternatives.") + if self.std_ is not None: + warn_if_not_float(X, estimator=self) + inplace_csr_column_scale(X, 1 / self.std_) + else: + warn_if_not_float(X, estimator=self) + if self.with_mean: + X -= self.mean_ + if self.with_std: + X /= self.std_ + return X + + def inverse_transform(self, X, copy=None): + """Scale back the data to the original representation + + Parameters + ---------- + X : array-like with shape [n_samples, n_features] + The data used to scale along the features axis. + """ + copy = copy if copy is not None else self.copy + if sp.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot uncenter sparse matrices: pass `with_mean=False` " + "instead See docstring for motivation and alternatives.") + if not sp.isspmatrix_csr(X): + X = X.tocsr() + copy = False + if copy: + X = X.copy() + if self.std_ is not None: + inplace_csr_column_scale(X, self.std_) + else: + X = np.asarray(X) + if copy: + X = X.copy() + if self.with_std: + X *= self.std_ + if self.with_mean: + X += self.mean_ + return X + + +class Scaler(StandardScaler): + def __init__(self, copy=True, with_mean=True, with_std=True): + warnings.warn("Scaler was renamed to StandardScaler. The old name " + " will be removed in 0.15.", DeprecationWarning) + super(Scaler, self).__init__(copy, with_mean, with_std) + + +class RankScaler(BaseEstimator, TransformerMixin): + """Rank-standardize features to a percentile, in the range [0, 1]. + + Rank-scaling happens independently on each feature, by determining + the percentile of the feature value. + A feature value that is smaller than observed during fitting + will scale to 0. + A feature value that is larger than observed during fitting + will scale to 1. + A feature value that is the median will scale to 0.5. + + Standardization of a dataset is a common requirement for many + machine learning estimators. Rank-scaling is useful when + estimators perform badly on StandardScalar features. 
Rank-scaling + is more robust than StandardScaler, because outliers can't have + large values post scaling. It is an empirical question whether + you want outliers to be given high importance (StandardScaler) + or not (RankScaler). + + Parameters + ---------- + n_ranks : int, 1000 by default + The number of different ranks possible. + i.e. The number of indices in the compressed ranking matrix + `sort_X_`. + This is an approximation, to save memory and transform + computation time. + e.g. if 1000, transformed values will have resolution 0.001. + If `None`, we store the full size matrix, comparable + in size to the initial fit `X`. + + Attributes + ---------- + `sort_X_` : array of ints, shape (n_samples, n_features) + The rank-index of every feature in the fit X. + + See also + -------- + :class:`sklearn.preprocessing.StandardScaler` to perform standardization + that is faster, but less robust to outliers. + """ + + def __init__(self, n_ranks=1000): + # TODO: Add min and max parameters? Default = [0, 1] + self.n_ranks = n_ranks + + def fit(self, X, y=None): + """Compute the feature ranks for later scaling. + + fit will take time O(n_features * n_samples * log(n_samples)), + because it must sort the entire matrix. + + It use memory O(n_features * n_ranks). + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The data used to compute feature ranks. + """ + X = array2d(X) + n_samples, n_features = X.shape + full_sort_X_ = np.sort(X, axis=0) + if not self.n_ranks or self.n_ranks >= n_samples: + # Store the full matrix + self.sort_X_ = full_sort_X_ + else: + # Approximate the stored sort_X_ + self.sort_X_ = np.zeros((self.n_ranks, n_features)) + for i in range(self.n_ranks): + for j in range(n_features): + # Find the corresponding i in the original ranking + iorig = i * 1. * n_samples / self.n_ranks + ioriglo = int(iorig) + iorighi = ioriglo + 1 + + if ioriglo == n_samples: + self.sort_X_[i, j] = full_sort_X_[ioriglo, j] + else: + # And use linear interpolation to combine the + # original values. + wlo = (1 - (iorig - ioriglo)) + whi = (1 - (iorighi - iorig)) + assert wlo >= 0 and wlo <= 1 + assert whi >= 0 and whi <= 1 + assert_almost_equal(wlo+whi, 1.) + self.sort_X_[i, j] = wlo * full_sort_X_[ioriglo, j] \ + + whi * full_sort_X_[iorighi, j] + return self + + def transform(self, X): + """Perform rank-standardization. + + transform will take O(n_features * n_samples * log(n_ranks)), + where `n_fit_samples` is the number of samples used during `fit`. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The data used to scale along the features axis. + """ + X = array2d(X) + warn_if_not_float(X, estimator=self) + # TODO: Can add a copy parameter, and simply overwrite X if copy=False + X2 = np.zeros(X.shape) + for j in range(X.shape[1]): + lidx = np.searchsorted(self.sort_X_[:, j], X[:, j], side='left') + ridx = np.searchsorted(self.sort_X_[:, j], X[:, j], side='right') + v = 1. * (lidx + ridx) / (2 * self.sort_X_.shape[0]) + X2[:,j] = v + return X2 + + # TODO : Add inverse_transform method. + # I believe we could reuse the approximation code in `fit`. + +def normalize(X, norm='l2', axis=1, copy=True): + """Normalize a dataset along any axis + + Parameters + ---------- + X : array or scipy.sparse matrix with shape [n_samples, n_features] + The data to normalize, element by element. + scipy.sparse matrices should be in CSR format to avoid an + un-necessary copy. 
+ + norm : 'l1' or 'l2', optional ('l2' by default) + The norm to use to normalize each non zero sample (or each non-zero + feature if axis is 0). + + axis : 0 or 1, optional (1 by default) + axis used to normalize the data along. If 1, independently normalize + each sample, otherwise (if 0) normalize each feature. + + copy : boolean, optional, default is True + set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array or a scipy.sparse + CSR matrix and if axis is 1). + + See also + -------- + :class:`sklearn.preprocessing.Normalizer` to perform normalization + using the ``Transformer`` API (e.g. as part of a preprocessing + :class:`sklearn.pipeline.Pipeline`) + """ + if norm not in ('l1', 'l2'): + raise ValueError("'%s' is not a supported norm" % norm) + + if axis == 0: + sparse_format = 'csc' + elif axis == 1: + sparse_format = 'csr' + else: + raise ValueError("'%d' is not a supported axis" % axis) + + X = check_arrays(X, sparse_format=sparse_format, copy=copy)[0] + warn_if_not_float(X, 'The normalize function') + if axis == 0: + X = X.T + + if sp.issparse(X): + if norm == 'l1': + inplace_csr_row_normalize_l1(X) + elif norm == 'l2': + inplace_csr_row_normalize_l2(X) + else: + if norm == 'l1': + norms = np.abs(X).sum(axis=1)[:, np.newaxis] + norms[norms == 0.0] = 1.0 + elif norm == 'l2': + norms = np.sqrt(np.sum(X ** 2, axis=1))[:, np.newaxis] + norms[norms == 0.0] = 1.0 + X /= norms + + if axis == 0: + X = X.T + + return X + + +class Normalizer(BaseEstimator, TransformerMixin): + """Normalize samples individually to unit norm + + Each sample (i.e. each row of the data matrix) with at least one + non zero component is rescaled independently of other samples so + that its norm (l1 or l2) equals one. + + This transformer is able to work both with dense numpy arrays and + scipy.sparse matrix (use CSR format if you want to avoid the burden of + a copy / conversion). + + Scaling inputs to unit norms is a common operation for text + classification or clustering for instance. For instance the dot + product of two l2-normalized TF-IDF vectors is the cosine similarity + of the vectors and is the base similarity metric for the Vector + Space Model commonly used by the Information Retrieval community. + + Parameters + ---------- + norm : 'l1' or 'l2', optional ('l2' by default) + The norm to use to normalize each non zero sample. + + copy : boolean, optional, default is True + set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array or a scipy.sparse + CSR matrix). + + Notes + ----- + This estimator is stateless (besides constructor parameters), the + fit method does nothing but is useful when used in a pipeline. + + See also + -------- + :func:`sklearn.preprocessing.normalize` equivalent function + without the object oriented API + """ + + def __init__(self, norm='l2', copy=True): + self.norm = norm + self.copy = copy + + def fit(self, X, y=None): + """Do nothing and return the estimator unchanged + + This method is just there to implement the usual API and hence + work in pipelines. + """ + atleast2d_or_csr(X) + return self + + def transform(self, X, y=None, copy=None): + """Scale each non zero row of X to unit norm + + Parameters + ---------- + X : array or scipy.sparse matrix with shape [n_samples, n_features] + The data to normalize, row by row. scipy.sparse matrices should be + in CSR format to avoid an un-necessary copy. 
+        """
+        copy = copy if copy is not None else self.copy
+        atleast2d_or_csr(X)
+        return normalize(X, norm=self.norm, axis=1, copy=copy)
+
+
+def binarize(X, threshold=0.0, copy=True):
+    """Boolean thresholding of array-like or scipy.sparse matrix
+
+    Parameters
+    ----------
+    X : array or scipy.sparse matrix with shape [n_samples, n_features]
+        The data to binarize, element by element.
+        scipy.sparse matrices should be in CSR or CSC format to avoid an
+        un-necessary copy.
+
+    threshold : float, optional (0.0 by default)
+        Feature values below or equal to this are replaced by 0, above it by 1.
+        Threshold may not be less than 0 for operations on sparse matrices.
+
+    copy : boolean, optional, default is True
+        set to False to perform inplace binarization and avoid a copy
+        (if the input is already a numpy array or a scipy.sparse CSR / CSC
+        matrix).
+
+    See also
+    --------
+    :class:`sklearn.preprocessing.Binarizer` to perform binarization
+    using the ``Transformer`` API (e.g. as part of a preprocessing
+    :class:`sklearn.pipeline.Pipeline`)
+    """
+    sparse_format = "csr"  # We force sparse format to be either csr or csc.
+    if hasattr(X, "format"):
+        if X.format in ["csr", "csc"]:
+            sparse_format = X.format
+
+    X = check_arrays(X, sparse_format=sparse_format, copy=copy)[0]
+    if sp.issparse(X):
+        if threshold < 0:
+            raise ValueError('Cannot binarize a sparse matrix with threshold '
+                             '< 0')
+        cond = X.data > threshold
+        not_cond = np.logical_not(cond)
+        X.data[cond] = 1
+        X.data[not_cond] = 0
+        X.eliminate_zeros()
+    else:
+        cond = X > threshold
+        not_cond = np.logical_not(cond)
+        X[cond] = 1
+        X[not_cond] = 0
+    return X
+
+
+class Binarizer(BaseEstimator, TransformerMixin):
+    """Binarize data (set feature values to 0 or 1) according to a threshold
+
+    Values greater than the threshold map to 1, while values less than
+    or equal to the threshold map to 0. With the default threshold of 0,
+    only positive values map to 1.
+
+    Binarization is a common operation on text count data where the
+    analyst can decide to only consider the presence or absence of a
+    feature rather than a quantified number of occurrences for instance.
+
+    It can also be used as a pre-processing step for estimators that
+    consider boolean random variables (e.g. modelled using the Bernoulli
+    distribution in a Bayesian setting).
+
+    Parameters
+    ----------
+    threshold : float, optional (0.0 by default)
+        Feature values below or equal to this are replaced by 0, above it by 1.
+        Threshold may not be less than 0 for operations on sparse matrices.
+
+    copy : boolean, optional, default is True
+        set to False to perform inplace binarization and avoid a copy (if
+        the input is already a numpy array or a scipy.sparse CSR matrix).
+
+    Notes
+    -----
+    If the input is a sparse matrix, only the non-zero values are subject
+    to update by the Binarizer class.
+
+    This estimator is stateless (besides constructor parameters), the
+    fit method does nothing but is useful when used in a pipeline.
+    """
+
+    def __init__(self, threshold=0.0, copy=True):
+        self.threshold = threshold
+        self.copy = copy
+
+    def fit(self, X, y=None):
+        """Do nothing and return the estimator unchanged
+
+        This method is just there to implement the usual API and hence
+        work in pipelines.
+        """
+        atleast2d_or_csr(X)
+        return self
+
+    def transform(self, X, y=None, copy=None):
+        """Binarize each element of X
+
+        Parameters
+        ----------
+        X : array or scipy.sparse matrix with shape [n_samples, n_features]
+            The data to binarize, element by element.
+ scipy.sparse matrices should be in CSR format to avoid an + un-necessary copy. + """ + copy = copy if copy is not None else self.copy + return binarize(X, threshold=self.threshold, copy=copy) + + +def _transform_selected(X, transform, selected="all", copy=True): + """Apply a transform function to portion of selected features + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + Dense array or sparse matrix. + + transform : callable + A callable transform(X) -> X_transformed + + copy : boolean, optional + Copy X even if it could be avoided. + + selected: "all" or array of indices or mask + Specify which features to apply the transform to. + + Returns + ------- + X : array or sparse matrix, shape=(n_samples, n_features_new) + """ + if selected == "all": + return transform(X) + + X = atleast2d_or_csc(X, copy=copy) + + if len(selected) == 0: + return X + + n_features = X.shape[1] + ind = np.arange(n_features) + sel = np.zeros(n_features, dtype=bool) + sel[np.asarray(selected)] = True + not_sel = np.logical_not(sel) + n_selected = np.sum(sel) + + if n_selected == 0: + # No features selected. + return X + elif n_selected == n_features: + # All features selected. + return transform(X) + else: + X_sel = transform(X[:, ind[sel]]) + X_not_sel = X[:, ind[not_sel]] + + if sp.issparse(X_sel) or sp.issparse(X_not_sel): + return sp.hstack((X_sel, X_not_sel)) + else: + return np.hstack((X_sel, X_not_sel)) + + +class OneHotEncoder(BaseEstimator, TransformerMixin): + """Encode categorical integer features using a one-hot aka one-of-K scheme. + + The input to this transformer should be a matrix of integers, denoting + the values taken on by categorical (discrete) features. The output will be + a sparse matrix were each column corresponds to one possible value of one + feature. It is assumed that input features take on values in the range + [0, n_values). + + This encoding is needed for feeding categorical data to many scikit-learn + estimators, notably linear models and SVMs with the standard kernels. + + Parameters + ---------- + n_values : 'auto', int or array of ints + Number of values per feature. + + - 'auto' : determine value range from training data. + - int : maximum value for all features. + - array : maximum value per feature. + + categorical_features: "all" or array of indices or mask + Specify what features are treated as categorical. + + - 'all' (default): All features are treated as categorical. + - array of indices: Array of categorical feature indices. + - mask: Array of length n_features and with dtype=bool. + + Non-categorical features are always stacked to the right of the matrix. + + dtype : number type, default=np.float + Desired dtype of output. + + Attributes + ---------- + `active_features_` : array + Indices for active features, meaning values that actually occur + in the training set. Only available when n_values is ``'auto'``. + + `feature_indices_` : array of shape (n_features,) + Indices to feature ranges. + Feature ``i`` in the original data is mapped to features + from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` + (and then potentially masked by `active_features_` afterwards) + + `n_values_` : array of shape (n_features,) + Maximum number of values per feature. + + Examples + -------- + Given a dataset with three features and two samples, we let the encoder + find the maximum value per feature and transform the data to a binary + one-hot encoding. 
+ + >>> from sklearn.preprocessing import OneHotEncoder + >>> enc = OneHotEncoder() + >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ +[1, 0, 2]]) # doctest: +ELLIPSIS + OneHotEncoder(categorical_features='all', dtype=<... 'float'>, + n_values='auto') + >>> enc.n_values_ + array([2, 3, 4]) + >>> enc.feature_indices_ + array([0, 2, 5, 9]) + >>> enc.transform([[0, 1, 1]]).toarray() + array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]]) + + See also + -------- + sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of + dictionary items (also handles string-valued features). + sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot + encoding of dictionary items or strings. + """ + def __init__(self, n_values="auto", categorical_features="all", + dtype=np.float): + self.n_values = n_values + self.categorical_features = categorical_features + self.dtype = dtype + + def fit(self, X, y=None): + """Fit OneHotEncoder to X. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_feature) + Input array of type int. + + Returns + ------- + self + """ + self.fit_transform(X) + return self + + def _fit_transform(self, X): + """Assumes X contains only categorical features.""" + X = check_arrays(X, sparse_format='dense', dtype=np.int)[0] + if np.any(X < 0): + raise ValueError("X needs to contain only non-negative integers.") + n_samples, n_features = X.shape + if self.n_values == 'auto': + n_values = np.max(X, axis=0) + 1 + elif isinstance(self.n_values, numbers.Integral): + n_values = np.empty(n_features, dtype=np.int) + n_values.fill(self.n_values) + else: + try: + n_values = np.asarray(self.n_values, dtype=int) + except (ValueError, TypeError): + raise TypeError("Wrong type for parameter `n_values`. Expected" + " 'auto', int or array of ints, got %r" + % type(X)) + if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: + raise ValueError("Shape mismatch: if n_values is an array," + " it has to be of shape (n_features,).") + self.n_values_ = n_values + n_values = np.hstack([[0], n_values]) + indices = np.cumsum(n_values) + self.feature_indices_ = indices + + column_indices = (X + indices[:-1]).ravel() + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + n_features) + data = np.ones(n_samples * n_features) + out = sp.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + + if self.n_values == 'auto': + mask = np.array(out.sum(axis=0)).ravel() != 0 + active_features = np.where(mask)[0] + out = out[:, active_features] + self.active_features_ = active_features + + return out + + def fit_transform(self, X, y=None): + """Fit OneHotEncoder to X, then transform X. + + Equivalent to self.fit(X).transform(X), but more convenient and more + efficient. See fit for the parameters, transform for the return value. + """ + return _transform_selected(X, self._fit_transform, + self.categorical_features, copy=True) + + def _transform(self, X): + """Asssumes X contains only categorical features.""" + X = check_arrays(X, sparse_format='dense', dtype=np.int)[0] + if np.any(X < 0): + raise ValueError("X needs to contain only non-negative integers.") + n_samples, n_features = X.shape + + indices = self.feature_indices_ + if n_features != indices.shape[0] - 1: + raise ValueError("X has different shape than during fitting." + " Expected %d, got %d." 
+ % (indices.shape[0] - 1, n_features)) + + n_values_check = np.max(X, axis=0) + 1 + if (n_values_check > self.n_values_).any(): + raise ValueError("Feature out of bounds. Try setting n_values.") + + column_indices = (X + indices[:-1]).ravel() + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + n_features) + data = np.ones(n_samples * n_features) + out = sp.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + if self.n_values == 'auto': + out = out[:, self.active_features_] + return out + + def transform(self, X): + """Transform X using one-hot encoding. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + Input array of type int. + + Returns + ------- + X_out : sparse matrix, dtype=int + Transformed input. + """ + return _transform_selected(X, self._transform, + self.categorical_features, copy=True) + + +class LabelEncoder(BaseEstimator, TransformerMixin): + """Encode labels with value between 0 and n_classes-1. + + Attributes + ---------- + `classes_`: array of shape [n_class] + Holds the label for each class. + + Examples + -------- + `LabelEncoder` can be used to normalize labels. + + >>> from sklearn import preprocessing + >>> le = preprocessing.LabelEncoder() + >>> le.fit([1, 2, 2, 6]) + LabelEncoder() + >>> le.classes_ + array([1, 2, 6]) + >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS + array([0, 0, 1, 2]...) + >>> le.inverse_transform([0, 0, 1, 2]) + array([1, 1, 2, 6]) + + It can also be used to transform non-numerical labels (as long as they are + hashable and comparable) to numerical labels. + + >>> le = preprocessing.LabelEncoder() + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder() + >>> list(le.classes_) + ['amsterdam', 'paris', 'tokyo'] + >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS + array([2, 2, 1]...) + >>> list(le.inverse_transform([2, 2, 1])) + ['tokyo', 'tokyo', 'paris'] + + """ + + def _check_fitted(self): + if not hasattr(self, "classes_"): + raise ValueError("LabelNormalizer was not fitted yet.") + + def fit(self, y): + """Fit label encoder + + Parameters + ---------- + y : array-like of shape [n_samples] + Target values. + + Returns + ------- + self : returns an instance of self. + """ + self.classes_ = np.unique(y) + return self + + def fit_transform(self, y): + """Fit label encoder and return encoded labels + + Parameters + ---------- + y : array-like of shape [n_samples] + Target values. + + Returns + ------- + y : array-like of shape [n_samples] + """ + self.classes_, y = unique(y, return_inverse=True) + return y + + def transform(self, y): + """Transform labels to normalized encoding. + + Parameters + ---------- + y : array-like of shape [n_samples] + Target values. + + Returns + ------- + y : array-like of shape [n_samples] + """ + self._check_fitted() + + classes = np.unique(y) + if len(np.intersect1d(classes, self.classes_)) < len(classes): + diff = np.setdiff1d(classes, self.classes_) + raise ValueError("y contains new labels: %s" % str(diff)) + + return np.searchsorted(self.classes_, y) + + def inverse_transform(self, y): + """Transform labels back to original encoding. + + Parameters + ---------- + y : numpy array of shape [n_samples] + Target values. 
+ + Returns + ------- + y : numpy array of shape [n_samples] + """ + self._check_fitted() + + y = np.asarray(y) + return self.classes_[y] + + +class LabelBinarizer(BaseEstimator, TransformerMixin): + """Binarize labels in a one-vs-all fashion + + Several regression and binary classification algorithms are + available in the scikit. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + At learning time, this simply consists in learning one regressor + or binary classifier per class. In doing so, one needs to convert + multi-class labels to binary labels (belong or does not belong + to the class). LabelBinarizer makes this process easy with the + transform method. + + At prediction time, one assigns the class for which the corresponding + model gave the greatest confidence. LabelBinarizer makes this easy + with the inverse_transform method. + + Parameters + ---------- + + neg_label: int (default: 0) + Value with which negative labels must be encoded. + + pos_label: int (default: 1) + Value with which positive labels must be encoded. + + Attributes + ---------- + `classes_`: array of shape [n_class] + Holds the label for each class. + + Examples + -------- + >>> from sklearn import preprocessing + >>> lb = preprocessing.LabelBinarizer() + >>> lb.fit([1, 2, 6, 4, 2]) + LabelBinarizer(neg_label=0, pos_label=1) + >>> lb.classes_ + array([1, 2, 4, 6]) + >>> lb.transform([1, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + >>> lb.fit_transform([(1, 2), (3,)]) + array([[1, 1, 0], + [0, 0, 1]]) + >>> lb.classes_ + array([1, 2, 3]) + """ + + def __init__(self, neg_label=0, pos_label=1): + if neg_label >= pos_label: + raise ValueError("neg_label must be strictly less than pos_label.") + + self.neg_label = neg_label + self.pos_label = pos_label + + def _check_fitted(self): + if not hasattr(self, "classes_"): + raise ValueError("LabelBinarizer was not fitted yet.") + + def fit(self, y): + """Fit label binarizer + + Parameters + ---------- + y : numpy array of shape [n_samples] or sequence of sequences + Target values. In the multilabel case the nested sequences can + have variable lengths. + + Returns + ------- + self : returns an instance of self. + """ + y_type = type_of_target(y) + self.multilabel = y_type.startswith('multilabel') + if self.multilabel: + self.indicator_matrix_ = y_type == 'multilabel-indicator' + + self.classes_ = unique_labels(y) + + return self + + def transform(self, y): + """Transform multi-class labels to binary labels + + The output of transform is sometimes referred to by some authors as the + 1-of-K coding scheme. + + Parameters + ---------- + y : numpy array of shape [n_samples] or sequence of sequences + Target values. In the multilabel case the nested sequences can + have variable lengths. 
+ + Returns + ------- + Y : numpy array of shape [n_samples, n_classes] + """ + self._check_fitted() + + y_type = type_of_target(y) + + if self.multilabel or len(self.classes_) > 2: + if y_type == 'multilabel-indicator': + # nothing to do as y is already a label indicator matrix + return y + + Y = np.zeros((len(y), len(self.classes_)), dtype=np.int) + else: + Y = np.zeros((len(y), 1), dtype=np.int) + + Y += self.neg_label + + y_is_multilabel = y_type.startswith('multilabel') + + if y_is_multilabel and not self.multilabel: + raise ValueError("The object was not fitted with multilabel" + " input!") + + elif self.multilabel: + if not y_is_multilabel: + raise ValueError("y should be a list of label lists/tuples," + "got %r" % (y,)) + + # inverse map: label => column index + imap = dict((v, k) for k, v in enumerate(self.classes_)) + + for i, label_tuple in enumerate(y): + for label in label_tuple: + Y[i, imap[label]] = self.pos_label + + return Y + + else: + y = np.asarray(y) + + if len(self.classes_) == 2: + Y[y == self.classes_[1], 0] = self.pos_label + return Y + + elif len(self.classes_) >= 2: + for i, k in enumerate(self.classes_): + Y[y == k, i] = self.pos_label + return Y + + else: + # Only one class, returns a matrix with all negative labels. + return Y + + def inverse_transform(self, Y, threshold=None): + """Transform binary labels back to multi-class labels + + Parameters + ---------- + Y : numpy array of shape [n_samples, n_classes] + Target values. + + threshold : float or None + Threshold used in the binary and multi-label cases. + + Use 0 when: + - Y contains the output of decision_function (classifier) + Use 0.5 when: + - Y contains the output of predict_proba + + If None, the threshold is assumed to be half way between + neg_label and pos_label. + + Returns + ------- + y : numpy array of shape [n_samples] or sequence of sequences + Target values. In the multilabel case the nested sequences can + have variable lengths. + + Notes + ----- + In the case when the binary labels are fractional + (probabilistic), inverse_transform chooses the class with the + greatest value. Typically, this allows to use the output of a + linear model's decision_function method directly as the input + of inverse_transform. + """ + self._check_fitted() + + if threshold is None: + half = (self.pos_label - self.neg_label) / 2.0 + threshold = self.neg_label + half + + if self.multilabel: + Y = np.array(Y > threshold, dtype=int) + # Return the predictions in the same format as in fit + if self.indicator_matrix_: + # Label indicator matrix format + return Y + else: + # Lists of tuples format + return [tuple(self.classes_[np.flatnonzero(Y[i])]) + for i in range(Y.shape[0])] + + if len(Y.shape) == 1 or Y.shape[1] == 1: + y = np.array(Y.ravel() > threshold, dtype=int) + + else: + y = Y.argmax(axis=1) + + return self.classes_[y] + + +class KernelCenterer(BaseEstimator, TransformerMixin): + """Center a kernel matrix + + Let K(x_i, x_j) be a kernel defined by K(x_i, x_j) = phi(x_i)^T phi(x_j), + where phi(x) is a function mapping x to a hilbert space. KernelCenterer is + a class to center (i.e., normalize to have zero-mean) the data without + explicitly computing phi(x). It is equivalent equivalent to centering + phi(x) with sklearn.preprocessing.StandardScaler(with_std=False). + """ + + def fit(self, K, y=None): + """Fit KernelCenterer + + Parameters + ---------- + K : numpy array of shape [n_samples, n_samples] + Kernel matrix. + + Returns + ------- + self : returns an instance of self. 
+ """ + K = array2d(K) + n_samples = K.shape[0] + self.K_fit_rows_ = np.sum(K, axis=0) / n_samples + self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples + return self + + def transform(self, K, y=None, copy=True): + """Center kernel + + Parameters + ---------- + K : numpy array of shape [n_samples1, n_samples2] + Kernel matrix. + + Returns + ------- + K_new : numpy array of shape [n_samples1, n_samples2] + """ + K = array2d(K) + if copy: + K = K.copy() + + K_pred_cols = (np.sum(K, axis=1) / + self.K_fit_rows_.shape[0])[:, np.newaxis] + + K -= self.K_fit_rows_ + K -= K_pred_cols + K += self.K_fit_all_ + + return K + + +def add_dummy_feature(X, value=1.0): + """Augment dataset with an additional dummy feature. + + This is useful for fitting an intercept term with implementations which + cannot otherwise fit it directly. + + Parameters + ---------- + X : array or scipy.sparse matrix with shape [n_samples, n_features] + Data. + + value : float + Value to use for the dummy feature. + + Returns + ------- + + X : array or scipy.sparse matrix with shape [n_samples, n_features + 1] + Same data with dummy feature added as first column. + + Examples + -------- + + >>> from sklearn.preprocessing import add_dummy_feature + >>> add_dummy_feature([[0, 1], [1, 0]]) + array([[ 1., 0., 1.], + [ 1., 1., 0.]]) + """ + X = safe_asarray(X) + n_samples, n_features = X.shape + shape = (n_samples, n_features + 1) + if sp.issparse(X): + if sp.isspmatrix_coo(X): + # Shift columns to the right. + col = X.col + 1 + # Column indices of dummy feature are 0 everywhere. + col = np.concatenate((np.zeros(n_samples), col)) + # Row indices of dummy feature are 0, ..., n_samples-1. + row = np.concatenate((np.arange(n_samples), X.row)) + # Prepend the dummy feature n_samples times. + data = np.concatenate((np.ones(n_samples) * value, X.data)) + return sp.coo_matrix((data, (row, col)), shape) + elif sp.isspmatrix_csc(X): + # Shift index pointers since we need to add n_samples elements. + indptr = X.indptr + n_samples + # indptr[0] must be 0. + indptr = np.concatenate((np.array([0]), indptr)) + # Row indices of dummy feature are 0, ..., n_samples-1. + indices = np.concatenate((np.arange(n_samples), X.indices)) + # Prepend the dummy feature n_samples times. 
+ data = np.concatenate((np.ones(n_samples) * value, X.data)) + return sp.csc_matrix((data, indices, indptr), shape) + else: + klass = X.__class__ + return klass(add_dummy_feature(X.tocoo(), value)) + else: + return np.hstack((np.ones((n_samples, 1)) * value, X)) diff --git a/sklearn/tests/test_preprocessing.py b/sklearn/tests/test_preprocessing.py new file mode 100644 index 0000000000000..6c87243d1be88 --- /dev/null +++ b/sklearn/tests/test_preprocessing.py @@ -0,0 +1,841 @@ +import warnings +import numpy as np +import numpy.linalg as la +import scipy.sparse as sp + +from sklearn.utils.testing import assert_almost_equal +from sklearn.utils.testing import assert_array_almost_equal +from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_true +from sklearn.utils.testing import assert_false + +from sklearn.utils.sparsefuncs import mean_variance_axis0 +from sklearn.preprocessing import Binarizer +from sklearn.preprocessing import KernelCenterer +from sklearn.preprocessing import LabelBinarizer +from sklearn.preprocessing import _transform_selected +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import Normalizer +from sklearn.preprocessing import normalize +from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import scale +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import RankScaler +from sklearn.preprocessing import add_dummy_feature + +from sklearn import datasets +from sklearn.linear_model.stochastic_gradient import SGDClassifier + +iris = datasets.load_iris() + + +def toarray(a): + if hasattr(a, "toarray"): + a = a.toarray() + return a + + +def test_scaler_1d(): + """Test scaling of dataset along single axis""" + rng = np.random.RandomState(0) + X = rng.randn(5) + X_orig_copy = X.copy() + + scaler = StandardScaler() + X_scaled = scaler.fit(X).transform(X, copy=False) + assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) + assert_array_almost_equal(X_scaled.std(axis=0), 1.0) + + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert_array_almost_equal(X_scaled_back, X_orig_copy) + + # Test with 1D list + X = [0., 1., 2, 0.4, 1.] 
+ scaler = StandardScaler() + X_scaled = scaler.fit(X).transform(X, copy=False) + assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) + assert_array_almost_equal(X_scaled.std(axis=0), 1.0) + + X_scaled = scale(X) + assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) + assert_array_almost_equal(X_scaled.std(axis=0), 1.0) + +# rank_scaler = RankScaler() +# X_rank_scaled = rank_scaler.fit(X).transform(X) + + +def test_scaler_2d_arrays(): + """Test scaling of 2d array along first axis""" + rng = np.random.RandomState(0) + X = rng.randn(4, 5) + X[:, 0] = 0.0 # first feature is always of zero + + scaler = StandardScaler() + X_scaled = scaler.fit(X).transform(X, copy=True) + assert_false(np.any(np.isnan(X_scaled))) + + assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) + # Check that X has been copied + assert_true(X_scaled is not X) + + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert_true(X_scaled_back is not X) + assert_true(X_scaled_back is not X_scaled) + assert_array_almost_equal(X_scaled_back, X) + + X_scaled = scale(X, axis=1, with_std=False) + assert_false(np.any(np.isnan(X_scaled))) + assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0]) + X_scaled = scale(X, axis=1, with_std=True) + assert_false(np.any(np.isnan(X_scaled))) + assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0]) + # Check that the data hasn't been modified + assert_true(X_scaled is not X) + + X_scaled = scaler.fit(X).transform(X, copy=False) + assert_false(np.any(np.isnan(X_scaled))) + assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) + # Check that X has not been copied + assert_true(X_scaled is X) + + X = rng.randn(4, 5) + X[:, 0] = 1.0 # first feature is a constant, non zero feature + scaler = StandardScaler() + X_scaled = scaler.fit(X).transform(X, copy=True) + assert_false(np.any(np.isnan(X_scaled))) + assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) + # Check that X has not been copied + assert_true(X_scaled is not X) + + X = np.array([[1, 0, 0, 0, 1], + [2, 1, 4, 1, 1], + [3, 2, 3, 1, 0], + [3, 0, 0, 4, 1]]) + + rank_scaler = RankScaler() + rank_scaler.fit(X) + X_scaled = rank_scaler.transform(X) + assert_array_almost_equal(X_scaled, [[0.125, 0.25, 0.25, 0.125, 0.625], + [0.375, 0.625, 0.875, 0.5, 0.625], + [0.75, 0.875, 0.625, 0.5, 0.125], + [0.75, 0.25, 0.25, 0.875, 0.625]]) + + X2 = np.array([[0, 1.5, 0, 5, 10]]) + X2_scaled = rank_scaler.transform(X2) + assert_array_almost_equal(X2_scaled, [[0., 0.75, 0.25, 1., 1.]]) + + # Check RankScaler at different n_ranks + n_features = 100 + for n_samples in [10, 100, 1000]: + for n_ranks in [n_samples + 1, n_samples, n_samples - 1, + int(n_samples / 2), int(n_samples / 7), int(n_samples / 10)]: + X = rng.randn(n_samples, n_features) + rank_scaler1 = RankScaler(n_ranks=None) + rank_scaler2 = RankScaler(n_ranks=n_ranks) + rank_scaler1.fit(X) + rank_scaler2.fit(X) + + X2 = rng.randn(1000, n_features) + X21 = rank_scaler1.transform(X2) + X22 = rank_scaler2.transform(X2) + + # In the approximate version X22, all values must + # be within 1./n_ranks of the exact value X11. + assert_true(np.all(np.fabs(X21 - X22) < 1. 
/ n_ranks)) + + +def test_min_max_scaler_iris(): + X = iris.data + scaler = MinMaxScaler() + # default params + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), 0) + assert_array_almost_equal(X_trans.min(axis=0), 0) + assert_array_almost_equal(X_trans.max(axis=0), 1) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # not default params: min=1, max=2 + scaler = MinMaxScaler(feature_range=(1, 2)) + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), 1) + assert_array_almost_equal(X_trans.max(axis=0), 2) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # min=-.5, max=.6 + scaler = MinMaxScaler(feature_range=(-.5, .6)) + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), -.5) + assert_array_almost_equal(X_trans.max(axis=0), .6) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # raises on invalid range + scaler = MinMaxScaler(feature_range=(2, 1)) + assert_raises(ValueError, scaler.fit, X) + + +def test_min_max_scaler_zero_variance_features(): + """Check min max scaler on toy data with zero variance features""" + X = [[0., 1., 0.5], + [0., 1., -0.1], + [0., 1., 1.1]] + + X_new = [[+0., 2., 0.5], + [-1., 1., 0.0], + [+0., 1., 1.5]] + + # default params + scaler = MinMaxScaler() + X_trans = scaler.fit_transform(X) + X_expected_0_1 = [[0., 0., 0.5], + [0., 0., 0.0], + [0., 0., 1.0]] + assert_array_almost_equal(X_trans, X_expected_0_1) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + X_trans_new = scaler.transform(X_new) + X_expected_0_1_new = [[+0., 1., 0.500], + [-1., 0., 0.083], + [+0., 0., 1.333]] + assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2) + + # not default params + scaler = MinMaxScaler(feature_range=(1, 2)) + X_trans = scaler.fit_transform(X) + X_expected_1_2 = [[1., 1., 1.5], + [1., 1., 1.0], + [1., 1., 2.0]] + assert_array_almost_equal(X_trans, X_expected_1_2) + + +def test_scaler_without_centering(): + rng = np.random.RandomState(42) + X = rng.randn(4, 5) + X[:, 0] = 0.0 # first feature is always of zero + X_csr = sp.csr_matrix(X) + X_csc = sp.csc_matrix(X) + + assert_raises(ValueError, StandardScaler().fit, X_csr) + + null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) + X_null = null_transform.fit_transform(X_csr) + assert_array_equal(X_null.data, X_csr.data) + X_orig = null_transform.inverse_transform(X_null) + assert_array_equal(X_orig.data, X_csr.data) + + scaler = StandardScaler(with_mean=False).fit(X) + X_scaled = scaler.transform(X, copy=True) + assert_false(np.any(np.isnan(X_scaled))) + + scaler_csr = StandardScaler(with_mean=False).fit(X_csr) + X_csr_scaled = scaler_csr.transform(X_csr, copy=True) + assert_false(np.any(np.isnan(X_csr_scaled.data))) + + scaler_csc = StandardScaler(with_mean=False).fit(X_csc) + X_csc_scaled = scaler_csr.transform(X_csc, copy=True) + assert_false(np.any(np.isnan(X_csc_scaled.data))) + + assert_equal(scaler.mean_, scaler_csr.mean_) + assert_array_almost_equal(scaler.std_, scaler_csr.std_) + + assert_equal(scaler.mean_, scaler_csc.mean_) + assert_array_almost_equal(scaler.std_, scaler_csc.std_) + + assert_array_almost_equal( + X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2) + assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) + + X_csr_scaled_mean, X_csr_scaled_std = 
mean_variance_axis0(X_csr_scaled) + assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) + assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) + + # Check that X has not been modified (copy) + assert_true(X_scaled is not X) + assert_true(X_csr_scaled is not X_csr) + + X_scaled_back = scaler.inverse_transform(X_scaled) + assert_true(X_scaled_back is not X) + assert_true(X_scaled_back is not X_scaled) + assert_array_almost_equal(X_scaled_back, X) + + X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled) + assert_true(X_csr_scaled_back is not X_csr) + assert_true(X_csr_scaled_back is not X_csr_scaled) + assert_array_almost_equal(X_csr_scaled_back.toarray(), X) + + X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc()) + assert_true(X_csc_scaled_back is not X_csc) + assert_true(X_csc_scaled_back is not X_csc_scaled) + assert_array_almost_equal(X_csc_scaled_back.toarray(), X) + + +def test_scaler_without_copy(): + """Check that StandardScaler.fit does not change input""" + rng = np.random.RandomState(42) + X = rng.randn(4, 5) + X[:, 0] = 0.0 # first feature is always of zero + X_csr = sp.csr_matrix(X) + + X_copy = X.copy() + StandardScaler(copy=False).fit(X) + assert_array_equal(X, X_copy) + + X_csr_copy = X_csr.copy() + StandardScaler(with_mean=False, copy=False).fit(X_csr) + assert_array_equal(X_csr.toarray(), X_csr_copy.toarray()) + + +def test_scale_sparse_with_mean_raise_exception(): + rng = np.random.RandomState(42) + X = rng.randn(4, 5) + X_csr = sp.csr_matrix(X) + + # check scaling and fit with direct calls on sparse data + assert_raises(ValueError, scale, X_csr, with_mean=True) + assert_raises(ValueError, StandardScaler(with_mean=True).fit, X_csr) + + # check transform and inverse_transform after a fit on a dense array + scaler = StandardScaler(with_mean=True).fit(X) + assert_raises(ValueError, scaler.transform, X_csr) + + X_transformed_csr = sp.csr_matrix(scaler.transform(X)) + assert_raises(ValueError, scaler.inverse_transform, X_transformed_csr) + + +def test_scale_function_without_centering(): + rng = np.random.RandomState(42) + X = rng.randn(4, 5) + X[:, 0] = 0.0 # first feature is always of zero + X_csr = sp.csr_matrix(X) + + X_scaled = scale(X, with_mean=False) + assert_false(np.any(np.isnan(X_scaled))) + + X_csr_scaled = scale(X_csr, with_mean=False) + assert_false(np.any(np.isnan(X_csr_scaled.data))) + + # test csc has same outcome + X_csc_scaled = scale(X_csr.tocsc(), with_mean=False) + assert_array_almost_equal(X_scaled, X_csc_scaled.toarray()) + + # raises value error on axis != 0 + assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1) + + assert_array_almost_equal(X_scaled.mean(axis=0), + [0., -0.01, 2.24, -0.35, -0.78], 2) + assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) + # Check that X has not been copied + assert_true(X_scaled is not X) + + X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled) + assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) + assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) + + +def test_warning_scaling_integers(): + """Check warning when scaling integer data""" + X = np.array([[1, 2, 0], + [0, 0, 0]], dtype=np.uint8) + + with warnings.catch_warnings(record=True) as w: + StandardScaler().fit(X) + assert_equal(len(w), 1) + + with warnings.catch_warnings(record=True) as w: + MinMaxScaler().fit(X) + assert_equal(len(w), 1) + + +def test_normalizer_l1(): + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) 
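+    # note: the unpruned CSR copy is built *before* row 3 is zeroed below, so
+    # overwriting its .data slice leaves explicitly stored zeros in place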
+ X_sparse_unpruned = sp.csr_matrix(X_dense) + + # set the row number 3 to zero + X_dense[3, :] = 0.0 + + # set the row number 3 to zero without pruning (can happen in real life) + indptr_3 = X_sparse_unpruned.indptr[3] + indptr_4 = X_sparse_unpruned.indptr[4] + X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0 + + # build the pruned variant using the regular constructor + X_sparse_pruned = sp.csr_matrix(X_dense) + + # check inputs that support the no-copy optim + for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): + + normalizer = Normalizer(norm='l1', copy=True) + X_norm = normalizer.transform(X) + assert_true(X_norm is not X) + X_norm1 = toarray(X_norm) + + normalizer = Normalizer(norm='l1', copy=False) + X_norm = normalizer.transform(X) + assert_true(X_norm is X) + X_norm2 = toarray(X_norm) + + for X_norm in (X_norm1, X_norm2): + row_sums = np.abs(X_norm).sum(axis=1) + for i in range(3): + assert_almost_equal(row_sums[i], 1.0) + assert_almost_equal(row_sums[3], 0.0) + + # check input for which copy=False won't prevent a copy + for init in (sp.coo_matrix, sp.csc_matrix, sp.lil_matrix): + X = init(X_dense) + X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X) + + assert_true(X_norm is not X) + assert_true(isinstance(X_norm, sp.csr_matrix)) + + X_norm = toarray(X_norm) + for i in range(3): + assert_almost_equal(row_sums[i], 1.0) + assert_almost_equal(la.norm(X_norm[3]), 0.0) + + +def test_normalizer_l2(): + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + X_sparse_unpruned = sp.csr_matrix(X_dense) + + # set the row number 3 to zero + X_dense[3, :] = 0.0 + + # set the row number 3 to zero without pruning (can happen in real life) + indptr_3 = X_sparse_unpruned.indptr[3] + indptr_4 = X_sparse_unpruned.indptr[4] + X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0 + + # build the pruned variant using the regular constructor + X_sparse_pruned = sp.csr_matrix(X_dense) + + # check inputs that support the no-copy optim + for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): + + normalizer = Normalizer(norm='l2', copy=True) + X_norm1 = normalizer.transform(X) + assert_true(X_norm1 is not X) + X_norm1 = toarray(X_norm1) + + normalizer = Normalizer(norm='l2', copy=False) + X_norm2 = normalizer.transform(X) + assert_true(X_norm2 is X) + X_norm2 = toarray(X_norm2) + + for X_norm in (X_norm1, X_norm2): + for i in range(3): + assert_almost_equal(la.norm(X_norm[i]), 1.0) + assert_almost_equal(la.norm(X_norm[3]), 0.0) + + # check input for which copy=False won't prevent a copy + for init in (sp.coo_matrix, sp.csc_matrix, sp.lil_matrix): + X = init(X_dense) + X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X) + + assert_true(X_norm is not X) + assert_true(isinstance(X_norm, sp.csr_matrix)) + + X_norm = toarray(X_norm) + for i in range(3): + assert_almost_equal(la.norm(X_norm[i]), 1.0) + assert_almost_equal(la.norm(X_norm[3]), 0.0) + + +def test_normalize_errors(): + """Check that invalid arguments yield ValueError""" + assert_raises(ValueError, normalize, [[0]], axis=2) + assert_raises(ValueError, normalize, [[0]], norm='l3') + + +def test_binarizer(): + X_ = np.array([[1, 0, 5], [2, 3, -1]]) + + for init in (np.array, list, sp.csr_matrix, sp.csc_matrix): + + X = init(X_.copy()) + + binarizer = Binarizer(threshold=2.0, copy=True) + X_bin = toarray(binarizer.transform(X)) + assert_equal(np.sum(X_bin == 0), 4) + assert_equal(np.sum(X_bin == 1), 2) + X_bin = binarizer.transform(X) + assert_equal(sp.issparse(X), sp.issparse(X_bin)) + + binarizer = 
Binarizer(copy=True).fit(X) + X_bin = toarray(binarizer.transform(X)) + assert_true(X_bin is not X) + assert_equal(np.sum(X_bin == 0), 2) + assert_equal(np.sum(X_bin == 1), 4) + + binarizer = Binarizer(copy=True) + X_bin = binarizer.transform(X) + assert_true(X_bin is not X) + X_bin = toarray(X_bin) + assert_equal(np.sum(X_bin == 0), 2) + assert_equal(np.sum(X_bin == 1), 4) + + binarizer = Binarizer(copy=False) + X_bin = binarizer.transform(X) + if init is not list: + assert_true(X_bin is X) + X_bin = toarray(X_bin) + assert_equal(np.sum(X_bin == 0), 2) + assert_equal(np.sum(X_bin == 1), 4) + + binarizer = Binarizer(threshold=-0.5, copy=True) + for init in (np.array, list): + X = init(X_.copy()) + + X_bin = toarray(binarizer.transform(X)) + assert_equal(np.sum(X_bin == 0), 1) + assert_equal(np.sum(X_bin == 1), 5) + X_bin = binarizer.transform(X) + + # Cannot use threshold < 0 for sparse + assert_raises(ValueError, binarizer.transform, sp.csc_matrix(X)) + + +def test_label_binarizer(): + lb = LabelBinarizer() + + # two-class case + inp = ["neg", "pos", "pos", "neg"] + expected = np.array([[0, 1, 1, 0]]).T + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + # multi-class case + inp = ["spam", "ham", "eggs", "ham", "0"] + expected = np.array([[0, 0, 0, 1], + [0, 0, 1, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [1, 0, 0, 0]]) + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +def test_label_binarizer_set_label_encoding(): + lb = LabelBinarizer(neg_label=-2, pos_label=2) + + # two-class case + inp = np.array([0, 1, 1, 0]) + expected = np.array([[-2, 2, 2, -2]]).T + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + # multi-class case + inp = np.array([3, 2, 1, 2, 0]) + expected = np.array([[-2, -2, -2, +2], + [-2, -2, +2, -2], + [-2, +2, -2, -2], + [-2, -2, +2, -2], + [+2, -2, -2, -2]]) + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +def test_label_binarizer_multilabel(): + lb = LabelBinarizer() + + # test input as lists of tuples + inp = [(2, 3), (1,), (1, 2)] + indicator_mat = np.array([[0, 1, 1], + [1, 0, 0], + [1, 1, 0]]) + got = lb.fit_transform(inp) + assert_array_equal(indicator_mat, got) + assert_equal(lb.inverse_transform(got), inp) + + # test input as label indicator matrix + lb.fit(indicator_mat) + assert_array_equal(indicator_mat, + lb.inverse_transform(indicator_mat)) + + # regression test for the two-class multilabel case + lb = LabelBinarizer() + + inp = [[1, 0], [0], [1], [0, 1]] + expected = np.array([[1, 1], + [1, 0], + [0, 1], + [1, 1]]) + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_equal([set(x) for x in lb.inverse_transform(got)], + [set(x) for x in inp]) + + +def test_label_binarizer_errors(): + """Check that invalid arguments yield ValueError""" + one_class = np.array([0, 0, 0, 0]) + lb = LabelBinarizer().fit(one_class) + + multi_label = [(2, 3), (0,), (0, 2)] + assert_raises(ValueError, lb.transform, multi_label) + + lb = LabelBinarizer() + assert_raises(ValueError, lb.transform, []) + assert_raises(ValueError, lb.inverse_transform, []) + + assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=1) + assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=2) + + +def test_one_hot_encoder(): + """Test OneHotEncoder's fit and transform.""" + X = 
[[3, 2, 1], [0, 1, 1]] + enc = OneHotEncoder() + # discover max values automatically + X_trans = enc.fit_transform(X).toarray() + assert_equal(X_trans.shape, (2, 5)) + assert_array_equal(enc.active_features_, + np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) + assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) + + # check outcome + assert_array_equal(X_trans, + [[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]]) + + # max value given as 3 + enc = OneHotEncoder(n_values=4) + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 4 * 3)) + assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) + + # max value given per feature + enc = OneHotEncoder(n_values=[3, 2, 2]) + X = [[1, 0, 1], [0, 1, 1]] + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 3 + 2 + 2)) + assert_array_equal(enc.n_values_, [3, 2, 2]) + # check that testing with larger feature works: + X = np.array([[2, 0, 1], [0, 1, 1]]) + enc.transform(X) + + # test that an error is raise when out of bounds: + X_too_large = [[0, 2, 1], [0, 1, 1]] + assert_raises(ValueError, enc.transform, X_too_large) + + # test that error is raised when wrong number of features + assert_raises(ValueError, enc.transform, X[:, :-1]) + # test that error is raised when wrong number of features in fit + # with prespecified n_values + assert_raises(ValueError, enc.fit, X[:, :-1]) + # test exception on wrong init param + assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X) + + enc = OneHotEncoder() + # test negative input to fit + assert_raises(ValueError, enc.fit, [[0], [-1]]) + + # test negative input to transform + enc.fit([[0], [1]]) + assert_raises(ValueError, enc.transform, [[0], [-1]]) + + +def _check_transform_selected(X, X_expected, sel): + for M in (X, sp.csr_matrix(X)): + Xtr = _transform_selected(M, Binarizer().transform, sel) + assert_array_equal(toarray(Xtr), X_expected) + + +def test_transform_selected(): + X = [[3, 2, 1], [0, 1, 1]] + + X_expected = [[1, 2, 1], [0, 1, 1]] + _check_transform_selected(X, X_expected, [0]) + _check_transform_selected(X, X_expected, [True, False, False]) + + X_expected = [[1, 1, 1], [0, 1, 1]] + _check_transform_selected(X, X_expected, [0, 1, 2]) + _check_transform_selected(X, X_expected, [True, True, True]) + _check_transform_selected(X, X_expected, "all") + + _check_transform_selected(X, X, []) + _check_transform_selected(X, X, [False, False, False]) + + +def _run_one_hot(X, X2, cat): + enc = OneHotEncoder(categorical_features=cat) + Xtr = enc.fit_transform(X) + X2tr = enc.transform(X2) + return Xtr, X2tr + + +def _check_one_hot(X, X2, cat, n_features): + ind = np.where(cat)[0] + # With mask + A, B = _run_one_hot(X, X2, cat) + # With indices + C, D = _run_one_hot(X, X2, ind) + # Check shape + assert_equal(A.shape, (2, n_features)) + assert_equal(B.shape, (1, n_features)) + assert_equal(C.shape, (2, n_features)) + assert_equal(D.shape, (1, n_features)) + # Check that mask and indices give the same results + assert_array_equal(toarray(A), toarray(C)) + assert_array_equal(toarray(B), toarray(D)) + + +def test_one_hot_encoder_categorical_features(): + X = np.array([[3, 2, 1], [0, 1, 1]]) + X2 = np.array([[1, 1, 1]]) + + cat = [True, False, False] + _check_one_hot(X, X2, cat, 4) + + # Edge case: all non-categorical + cat = [False, False, False] + _check_one_hot(X, X2, cat, 3) + + # Edge case: all categorical + cat = [True, True, True] + _check_one_hot(X, X2, cat, 5) + + +def test_label_encoder(): + """Test LabelEncoder's transform and inverse_transform methods""" + le = LabelEncoder() + 
le.fit([1, 1, 4, 5, -1, 0]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), + [1, 2, 3, 3, 4, 0, 0]) + assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), + [0, 1, 4, 4, 5, -1, -1]) + assert_raises(ValueError, le.transform, [0, 6]) + + +def test_label_encoder_fit_transform(): + """Test fit_transform""" + le = LabelEncoder() + ret = le.fit_transform([1, 1, 4, 5, -1, 0]) + assert_array_equal(ret, [2, 2, 3, 4, 0, 1]) + + le = LabelEncoder() + ret = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"]) + assert_array_equal(ret, [1, 1, 2, 0]) + + +def test_label_encoder_string_labels(): + """Test LabelEncoder's transform and inverse_transform methods with + non-numeric labels""" + le = LabelEncoder() + le.fit(["paris", "paris", "tokyo", "amsterdam"]) + assert_array_equal(le.classes_, ["amsterdam", "paris", "tokyo"]) + assert_array_equal(le.transform(["tokyo", "tokyo", "paris"]), + [2, 2, 1]) + assert_array_equal(le.inverse_transform([2, 2, 1]), + ["tokyo", "tokyo", "paris"]) + assert_raises(ValueError, le.transform, ["london"]) + + +def test_label_encoder_errors(): + """Check that invalid arguments yield ValueError""" + le = LabelEncoder() + assert_raises(ValueError, le.transform, []) + assert_raises(ValueError, le.inverse_transform, []) + + +def test_label_binarizer_iris(): + lb = LabelBinarizer() + Y = lb.fit_transform(iris.target) + clfs = [SGDClassifier().fit(iris.data, Y[:, k]) + for k in range(len(lb.classes_))] + Y_pred = np.array([clf.decision_function(iris.data) for clf in clfs]).T + y_pred = lb.inverse_transform(Y_pred) + accuracy = np.mean(iris.target == y_pred) + y_pred2 = SGDClassifier().fit(iris.data, iris.target).predict(iris.data) + accuracy2 = np.mean(iris.target == y_pred2) + assert_almost_equal(accuracy, accuracy2) + + +def test_label_binarizer_multilabel_unlabeled(): + """Check that LabelBinarizer can handle an unlabeled sample""" + lb = LabelBinarizer() + y = [[1, 2], [1], []] + Y = np.array([[1, 1], + [1, 0], + [0, 0]]) + assert_array_equal(lb.fit_transform(y), Y) + + +def test_center_kernel(): + """Test that KernelCenterer is equivalent to StandardScaler + in feature space""" + rng = np.random.RandomState(0) + X_fit = rng.random_sample((5, 4)) + scaler = StandardScaler(with_std=False) + scaler.fit(X_fit) + X_fit_centered = scaler.transform(X_fit) + K_fit = np.dot(X_fit, X_fit.T) + + # center fit time matrix + centerer = KernelCenterer() + K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T) + K_fit_centered2 = centerer.fit_transform(K_fit) + assert_array_almost_equal(K_fit_centered, K_fit_centered2) + + # center predict time matrix + X_pred = rng.random_sample((2, 4)) + K_pred = np.dot(X_pred, X_fit.T) + X_pred_centered = scaler.transform(X_pred) + K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T) + K_pred_centered2 = centerer.transform(K_pred) + assert_array_almost_equal(K_pred_centered, K_pred_centered2) + + +def test_fit_transform(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + for obj in ((StandardScaler(), Normalizer(), Binarizer())): + X_transformed = obj.fit(X).transform(X) + X_transformed2 = obj.fit_transform(X) + assert_array_equal(X_transformed, X_transformed2) + + +def test_add_dummy_feature(): + X = [[1, 0], [0, 1], [0, 1]] + X = add_dummy_feature(X) + assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + + +def test_add_dummy_feature_coo(): + X = sp.coo_matrix([[1, 0], [0, 1], [0, 1]]) + X = add_dummy_feature(X) + 
assert_true(sp.isspmatrix_coo(X), X) + assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + + +def test_add_dummy_feature_csc(): + X = sp.csc_matrix([[1, 0], [0, 1], [0, 1]]) + X = add_dummy_feature(X) + assert_true(sp.isspmatrix_csc(X), X) + assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + + +def test_add_dummy_feature_csr(): + X = sp.csr_matrix([[1, 0], [0, 1], [0, 1]]) + X = add_dummy_feature(X) + assert_true(sp.isspmatrix_csr(X), X) + assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) From cc8d264d21a795c7d1be06d274cdef7da6a8b457 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Feb 2017 15:36:16 +0100 Subject: [PATCH 002/106] move the code in the pre-processing module --- sklearn/preprocessing.py | 1433 ---------------------- sklearn/preprocessing/data.py | 115 ++ sklearn/preprocessing/tests/test_data.py | 38 +- sklearn/tests/test_preprocessing.py | 841 ------------- 4 files changed, 152 insertions(+), 2275 deletions(-) delete mode 100644 sklearn/preprocessing.py delete mode 100644 sklearn/tests/test_preprocessing.py diff --git a/sklearn/preprocessing.py b/sklearn/preprocessing.py deleted file mode 100644 index 1de57535ec14f..0000000000000 --- a/sklearn/preprocessing.py +++ /dev/null @@ -1,1433 +0,0 @@ -# Authors: Alexandre Gramfort -# Mathieu Blondel -# Olivier Grisel -# Andreas Mueller -# License: BSD 3 clause - -import warnings -import numbers - -import numpy as np -import scipy.sparse as sp - -from numpy.testing import assert_almost_equal - -from .base import BaseEstimator, TransformerMixin -from .externals.six import string_types -from .utils import check_arrays -from .utils import array2d -from .utils import atleast2d_or_csr -from .utils import atleast2d_or_csc -from .utils import safe_asarray -from .utils import warn_if_not_float -from .utils.fixes import unique - -from .utils.multiclass import unique_labels -from .utils.multiclass import is_multilabel -from .utils.multiclass import type_of_target - -from .utils.sparsefuncs import inplace_csr_row_normalize_l1 -from .utils.sparsefuncs import inplace_csr_row_normalize_l2 -from .utils.sparsefuncs import inplace_csr_column_scale -from .utils.sparsefuncs import mean_variance_axis0 -from .externals import six - -zip = six.moves.zip -map = six.moves.map - -__all__ = ['Binarizer', - 'KernelCenterer', - 'LabelBinarizer', - 'LabelEncoder', - 'MinMaxScaler', - 'Normalizer', - 'OneHotEncoder', - 'StandardScaler', - 'binarize', - 'normalize', - 'scale'] - - -def _mean_and_std(X, axis=0, with_mean=True, with_std=True): - """Compute mean and std deviation for centering, scaling. - - Zero valued std components are reset to 1.0 to avoid NaNs when scaling. - """ - X = np.asarray(X) - Xr = np.rollaxis(X, axis) - - if with_mean: - mean_ = Xr.mean(axis=0) - else: - mean_ = None - - if with_std: - std_ = Xr.std(axis=0) - if isinstance(std_, np.ndarray): - std_[std_ == 0.0] = 1.0 - elif std_ == 0.: - std_ = 1. - else: - std_ = None - - return mean_, std_ - - -def scale(X, axis=0, with_mean=True, with_std=True, copy=True): - """Standardize a dataset along any axis - - Center to the mean and component wise scale to unit variance. - - Parameters - ---------- - X : array-like or CSR matrix. - The data to center and scale. - - axis : int (0 by default) - axis used to compute the means and standard deviations along. If 0, - independently standardize each feature, otherwise (if 1) standardize - each sample. - - with_mean : boolean, True by default - If True, center the data before scaling. 
- - with_std : boolean, True by default - If True, scale the data to unit variance (or equivalently, - unit standard deviation). - - copy : boolean, optional, default is True - set to False to perform inplace row normalization and avoid a - copy (if the input is already a numpy array or a scipy.sparse - CSR matrix and if axis is 1). - - Notes - ----- - This implementation will refuse to center scipy.sparse matrices - since it would make them non-sparse and would potentially crash the - program with memory exhaustion problems. - - Instead the caller is expected to either set explicitly - `with_mean=False` (in that case, only variance scaling will be - performed on the features of the CSR matrix) or to call `X.toarray()` - if he/she expects the materialized dense array to fit in memory. - - To avoid memory copy the caller should pass a CSR matrix. - - See also - -------- - :class:`sklearn.preprocessing.StandardScaler` to perform centering and - scaling using the ``Transformer`` API (e.g. as part of a preprocessing - :class:`sklearn.pipeline.Pipeline`) - """ - if sp.issparse(X): - if with_mean: - raise ValueError( - "Cannot center sparse matrices: pass `with_mean=False` instead" - " See docstring for motivation and alternatives.") - if axis != 0: - raise ValueError("Can only scale sparse matrix on axis=0, " - " got axis=%d" % axis) - warn_if_not_float(X, estimator='The scale function') - if not sp.isspmatrix_csr(X): - X = X.tocsr() - copy = False - if copy: - X = X.copy() - _, var = mean_variance_axis0(X) - var[var == 0.0] = 1.0 - inplace_csr_column_scale(X, 1 / np.sqrt(var)) - else: - X = np.asarray(X) - warn_if_not_float(X, estimator='The scale function') - mean_, std_ = _mean_and_std( - X, axis, with_mean=with_mean, with_std=with_std) - if copy: - X = X.copy() - # Xr is a view on the original array that enables easy use of - # broadcasting on the axis in which we are interested in - Xr = np.rollaxis(X, axis) - if with_mean: - Xr -= mean_ - if with_std: - Xr /= std_ - return X - - -class MinMaxScaler(BaseEstimator, TransformerMixin): - """Standardizes features by scaling each feature to a given range. - - This estimator scales and translates each feature individually such - that it is in the given range on the training set, i.e. between - zero and one. - - The standardization is given by:: - X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) - X_scaled = X_std * (max - min) + min - - where min, max = feature_range. - - This standardization is often used as an alternative to zero mean, - unit variance scaling. - - Parameters - ---------- - feature_range: tuple (min, max), default=(0, 1) - Desired range of transformed data. - - copy : boolean, optional, default is True - Set to False to perform inplace row normalization and avoid a - copy (if the input is already a numpy array). - - Attributes - ---------- - `min_` : ndarray, shape (n_features,) - Per feature adjustment for minimum. - - `scale_` : ndarray, shape (n_features,) - Per feature relative scaling of the data. - """ - - def __init__(self, feature_range=(0, 1), copy=True): - self.feature_range = feature_range - self.copy = copy - - def fit(self, X, y=None): - """Compute the minimum and maximum to be used for later scaling. - - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - The data used to compute the per-feature minimum and maximum - used for later scaling along the features axis. 
- """ - X = check_arrays(X, sparse_format="dense", copy=self.copy)[0] - warn_if_not_float(X, estimator=self) - feature_range = self.feature_range - if feature_range[0] >= feature_range[1]: - raise ValueError("Minimum of desired feature range must be smaller" - " than maximum. Got %s." % str(feature_range)) - data_min = np.min(X, axis=0) - data_range = np.max(X, axis=0) - data_min - # Do not scale constant features - data_range[data_range == 0.0] = 1.0 - self.scale_ = (feature_range[1] - feature_range[0]) / data_range - self.min_ = feature_range[0] - data_min * self.scale_ - self.data_range = data_range - self.data_min = data_min - return self - - def transform(self, X): - """Scaling features of X according to feature_range. - - Parameters - ---------- - X : array-like with shape [n_samples, n_features] - Input data that will be transformed. - """ - X = check_arrays(X, sparse_format="dense", copy=self.copy)[0] - X *= self.scale_ - X += self.min_ - return X - - def inverse_transform(self, X): - """Undo the scaling of X according to feature_range. - - Parameters - ---------- - X : array-like with shape [n_samples, n_features] - Input data that will be transformed. - """ - X = check_arrays(X, sparse_format="dense", copy=self.copy)[0] - X -= self.min_ - X /= self.scale_ - return X - - -class StandardScaler(BaseEstimator, TransformerMixin): - """Standardize features by removing the mean and scaling to unit variance - - Centering and scaling happen independently on each feature by computing - the relevant statistics on the samples in the training set. Mean and - standard deviation are then stored to be used on later data using the - `transform` method. - - Standardization of a dataset is a common requirement for many - machine learning estimators: they might behave badly if the - individual feature do not more or less look like standard normally - distributed data (e.g. Gaussian with 0 mean and unit variance). - - For instance many elements used in the objective function of - a learning algorithm (such as the RBF kernel of Support Vector - Machines or the L1 and L2 regularizers of linear models) assume that - all features are centered around 0 and have variance in the same - order. If a feature has a variance that is orders of magnitude larger - that others, it might dominate the objective function and make the - estimator unable to learn from other features correctly as expected. - - Parameters - ---------- - with_mean : boolean, True by default - If True, center the data before scaling. - This does not work (and will raise an exception) when attempted on - sparse matrices, because centering them entails building a dense - matrix which in common use cases is likely to be too large to fit in - memory. - - with_std : boolean, True by default - If True, scale the data to unit variance (or equivalently, - unit standard deviation). - - copy : boolean, optional, default is True - If False, try to avoid a copy and do inplace scaling instead. - This is not guaranteed to always work inplace; e.g. if the data is - not a NumPy array or scipy.sparse CSR matrix, a copy may still be - returned. - - Attributes - ---------- - `mean_` : array of floats with shape [n_features] - The mean value for each feature in the training set. - - `std_` : array of floats with shape [n_features] - The standard deviation for each feature in the training set. 
- - See also - -------- - :func:`sklearn.preprocessing.scale` to perform centering and - scaling without using the ``Transformer`` object oriented API - - :class:`sklearn.preprocessing.RankScaler` to perform standardization - that is more robust to outliers, but slower and more memory-intensive. - - :class:`sklearn.decomposition.RandomizedPCA` with `whiten=True` - to further remove the linear correlation across features. - """ - - def __init__(self, copy=True, with_mean=True, with_std=True): - self.with_mean = with_mean - self.with_std = with_std - self.copy = copy - - def fit(self, X, y=None): - """Compute the mean and std to be used for later scaling. - - Parameters - ---------- - X : array-like or CSR matrix with shape [n_samples, n_features] - The data used to compute the mean and standard deviation - used for later scaling along the features axis. - """ - X = check_arrays(X, copy=self.copy, sparse_format="csr")[0] - if sp.issparse(X): - if self.with_mean: - raise ValueError( - "Cannot center sparse matrices: pass `with_mean=False` " - "instead. See docstring for motivation and alternatives.") - warn_if_not_float(X, estimator=self) - self.mean_ = None - - if self.with_std: - var = mean_variance_axis0(X)[1] - self.std_ = np.sqrt(var) - self.std_[var == 0.0] = 1.0 - else: - self.std_ = None - return self - else: - warn_if_not_float(X, estimator=self) - self.mean_, self.std_ = _mean_and_std( - X, axis=0, with_mean=self.with_mean, with_std=self.with_std) - return self - - def transform(self, X, y=None, copy=None): - """Perform standardization by centering and scaling - - Parameters - ---------- - X : array-like with shape [n_samples, n_features] - The data used to scale along the features axis. - """ - copy = copy if copy is not None else self.copy - X = check_arrays(X, copy=copy, sparse_format="csr")[0] - if sp.issparse(X): - if self.with_mean: - raise ValueError( - "Cannot center sparse matrices: pass `with_mean=False` " - "instead See docstring for motivation and alternatives.") - if self.std_ is not None: - warn_if_not_float(X, estimator=self) - inplace_csr_column_scale(X, 1 / self.std_) - else: - warn_if_not_float(X, estimator=self) - if self.with_mean: - X -= self.mean_ - if self.with_std: - X /= self.std_ - return X - - def inverse_transform(self, X, copy=None): - """Scale back the data to the original representation - - Parameters - ---------- - X : array-like with shape [n_samples, n_features] - The data used to scale along the features axis. - """ - copy = copy if copy is not None else self.copy - if sp.issparse(X): - if self.with_mean: - raise ValueError( - "Cannot uncenter sparse matrices: pass `with_mean=False` " - "instead See docstring for motivation and alternatives.") - if not sp.isspmatrix_csr(X): - X = X.tocsr() - copy = False - if copy: - X = X.copy() - if self.std_ is not None: - inplace_csr_column_scale(X, self.std_) - else: - X = np.asarray(X) - if copy: - X = X.copy() - if self.with_std: - X *= self.std_ - if self.with_mean: - X += self.mean_ - return X - - -class Scaler(StandardScaler): - def __init__(self, copy=True, with_mean=True, with_std=True): - warnings.warn("Scaler was renamed to StandardScaler. The old name " - " will be removed in 0.15.", DeprecationWarning) - super(Scaler, self).__init__(copy, with_mean, with_std) - - -class RankScaler(BaseEstimator, TransformerMixin): - """Rank-standardize features to a percentile, in the range [0, 1]. - - Rank-scaling happens independently on each feature, by determining - the percentile of the feature value. 
- A feature value that is smaller than observed during fitting - will scale to 0. - A feature value that is larger than observed during fitting - will scale to 1. - A feature value that is the median will scale to 0.5. - - Standardization of a dataset is a common requirement for many - machine learning estimators. Rank-scaling is useful when - estimators perform badly on StandardScalar features. Rank-scaling - is more robust than StandardScaler, because outliers can't have - large values post scaling. It is an empirical question whether - you want outliers to be given high importance (StandardScaler) - or not (RankScaler). - - Parameters - ---------- - n_ranks : int, 1000 by default - The number of different ranks possible. - i.e. The number of indices in the compressed ranking matrix - `sort_X_`. - This is an approximation, to save memory and transform - computation time. - e.g. if 1000, transformed values will have resolution 0.001. - If `None`, we store the full size matrix, comparable - in size to the initial fit `X`. - - Attributes - ---------- - `sort_X_` : array of ints, shape (n_samples, n_features) - The rank-index of every feature in the fit X. - - See also - -------- - :class:`sklearn.preprocessing.StandardScaler` to perform standardization - that is faster, but less robust to outliers. - """ - - def __init__(self, n_ranks=1000): - # TODO: Add min and max parameters? Default = [0, 1] - self.n_ranks = n_ranks - - def fit(self, X, y=None): - """Compute the feature ranks for later scaling. - - fit will take time O(n_features * n_samples * log(n_samples)), - because it must sort the entire matrix. - - It use memory O(n_features * n_ranks). - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - The data used to compute feature ranks. - """ - X = array2d(X) - n_samples, n_features = X.shape - full_sort_X_ = np.sort(X, axis=0) - if not self.n_ranks or self.n_ranks >= n_samples: - # Store the full matrix - self.sort_X_ = full_sort_X_ - else: - # Approximate the stored sort_X_ - self.sort_X_ = np.zeros((self.n_ranks, n_features)) - for i in range(self.n_ranks): - for j in range(n_features): - # Find the corresponding i in the original ranking - iorig = i * 1. * n_samples / self.n_ranks - ioriglo = int(iorig) - iorighi = ioriglo + 1 - - if ioriglo == n_samples: - self.sort_X_[i, j] = full_sort_X_[ioriglo, j] - else: - # And use linear interpolation to combine the - # original values. - wlo = (1 - (iorig - ioriglo)) - whi = (1 - (iorighi - iorig)) - assert wlo >= 0 and wlo <= 1 - assert whi >= 0 and whi <= 1 - assert_almost_equal(wlo+whi, 1.) - self.sort_X_[i, j] = wlo * full_sort_X_[ioriglo, j] \ - + whi * full_sort_X_[iorighi, j] - return self - - def transform(self, X): - """Perform rank-standardization. - - transform will take O(n_features * n_samples * log(n_ranks)), - where `n_fit_samples` is the number of samples used during `fit`. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - The data used to scale along the features axis. - """ - X = array2d(X) - warn_if_not_float(X, estimator=self) - # TODO: Can add a copy parameter, and simply overwrite X if copy=False - X2 = np.zeros(X.shape) - for j in range(X.shape[1]): - lidx = np.searchsorted(self.sort_X_[:, j], X[:, j], side='left') - ridx = np.searchsorted(self.sort_X_[:, j], X[:, j], side='right') - v = 1. * (lidx + ridx) / (2 * self.sort_X_.shape[0]) - X2[:,j] = v - return X2 - - # TODO : Add inverse_transform method. - # I believe we could reuse the approximation code in `fit`. 
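The percentile computed in RankScaler.transform is a midrank: the average of the left and right insertion
points of the query value into the sorted fit-time column, divided by twice the number of fitted samples.
A minimal standalone sketch of that formula (the column values below are hypothetical, chosen only to show
the tie and out-of-range behaviour):

    import numpy as np

    # One feature column as stored after fit (already sorted).
    sort_col = np.array([1., 2., 3., 3.])
    n = sort_col.shape[0]

    for value in (0., 1., 3., 5.):
        lidx = np.searchsorted(sort_col, value, side='left')
        ridx = np.searchsorted(sort_col, value, side='right')
        # Midrank percentile, as in RankScaler.transform.
        print(value, (lidx + ridx) / (2. * n))

    # 0.0 -> 0.0    (smaller than anything seen during fit)
    # 1.0 -> 0.125
    # 3.0 -> 0.75   (tied values share the average of their ranks)
    # 5.0 -> 1.0    (larger than anything seen during fit)
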
- -def normalize(X, norm='l2', axis=1, copy=True): - """Normalize a dataset along any axis - - Parameters - ---------- - X : array or scipy.sparse matrix with shape [n_samples, n_features] - The data to normalize, element by element. - scipy.sparse matrices should be in CSR format to avoid an - un-necessary copy. - - norm : 'l1' or 'l2', optional ('l2' by default) - The norm to use to normalize each non zero sample (or each non-zero - feature if axis is 0). - - axis : 0 or 1, optional (1 by default) - axis used to normalize the data along. If 1, independently normalize - each sample, otherwise (if 0) normalize each feature. - - copy : boolean, optional, default is True - set to False to perform inplace row normalization and avoid a - copy (if the input is already a numpy array or a scipy.sparse - CSR matrix and if axis is 1). - - See also - -------- - :class:`sklearn.preprocessing.Normalizer` to perform normalization - using the ``Transformer`` API (e.g. as part of a preprocessing - :class:`sklearn.pipeline.Pipeline`) - """ - if norm not in ('l1', 'l2'): - raise ValueError("'%s' is not a supported norm" % norm) - - if axis == 0: - sparse_format = 'csc' - elif axis == 1: - sparse_format = 'csr' - else: - raise ValueError("'%d' is not a supported axis" % axis) - - X = check_arrays(X, sparse_format=sparse_format, copy=copy)[0] - warn_if_not_float(X, 'The normalize function') - if axis == 0: - X = X.T - - if sp.issparse(X): - if norm == 'l1': - inplace_csr_row_normalize_l1(X) - elif norm == 'l2': - inplace_csr_row_normalize_l2(X) - else: - if norm == 'l1': - norms = np.abs(X).sum(axis=1)[:, np.newaxis] - norms[norms == 0.0] = 1.0 - elif norm == 'l2': - norms = np.sqrt(np.sum(X ** 2, axis=1))[:, np.newaxis] - norms[norms == 0.0] = 1.0 - X /= norms - - if axis == 0: - X = X.T - - return X - - -class Normalizer(BaseEstimator, TransformerMixin): - """Normalize samples individually to unit norm - - Each sample (i.e. each row of the data matrix) with at least one - non zero component is rescaled independently of other samples so - that its norm (l1 or l2) equals one. - - This transformer is able to work both with dense numpy arrays and - scipy.sparse matrix (use CSR format if you want to avoid the burden of - a copy / conversion). - - Scaling inputs to unit norms is a common operation for text - classification or clustering for instance. For instance the dot - product of two l2-normalized TF-IDF vectors is the cosine similarity - of the vectors and is the base similarity metric for the Vector - Space Model commonly used by the Information Retrieval community. - - Parameters - ---------- - norm : 'l1' or 'l2', optional ('l2' by default) - The norm to use to normalize each non zero sample. - - copy : boolean, optional, default is True - set to False to perform inplace row normalization and avoid a - copy (if the input is already a numpy array or a scipy.sparse - CSR matrix). - - Notes - ----- - This estimator is stateless (besides constructor parameters), the - fit method does nothing but is useful when used in a pipeline. - - See also - -------- - :func:`sklearn.preprocessing.normalize` equivalent function - without the object oriented API - """ - - def __init__(self, norm='l2', copy=True): - self.norm = norm - self.copy = copy - - def fit(self, X, y=None): - """Do nothing and return the estimator unchanged - - This method is just there to implement the usual API and hence - work in pipelines. 
- """ - atleast2d_or_csr(X) - return self - - def transform(self, X, y=None, copy=None): - """Scale each non zero row of X to unit norm - - Parameters - ---------- - X : array or scipy.sparse matrix with shape [n_samples, n_features] - The data to normalize, row by row. scipy.sparse matrices should be - in CSR format to avoid an un-necessary copy. - """ - copy = copy if copy is not None else self.copy - atleast2d_or_csr(X) - return normalize(X, norm=self.norm, axis=1, copy=copy) - - -def binarize(X, threshold=0.0, copy=True): - """Boolean thresholding of array-like or scipy.sparse matrix - - Parameters - ---------- - X : array or scipy.sparse matrix with shape [n_samples, n_features] - The data to binarize, element by element. - scipy.sparse matrices should be in CSR or CSC format to avoid an - un-necessary copy. - - threshold : float, optional (0.0 by default) - Feature values below this are replaced by 1, above it by 0. - Threshold may not be less than 0 for operations on sparse matrices. - - copy : boolean, optional, default is True - set to False to perform inplace binarization and avoid a copy - (if the input is already a numpy array or a scipy.sparse CSR / CSC - matrix and if axis is 1). - - See also - -------- - :class:`sklearn.preprocessing.Binarizer` to perform binarization - using the ``Transformer`` API (e.g. as part of a preprocessing - :class:`sklearn.pipeline.Pipeline`) - """ - sparse_format = "csr" # We force sparse format to be either csr or csc. - if hasattr(X, "format"): - if X.format in ["csr", "csc"]: - sparse_format = X.format - - X = check_arrays(X, sparse_format=sparse_format, copy=copy)[0] - if sp.issparse(X): - if threshold < 0: - raise ValueError('Cannot binarize a sparse matrix with threshold ' - '< 0') - cond = X.data > threshold - not_cond = np.logical_not(cond) - X.data[cond] = 1 - X.data[not_cond] = 0 - X.eliminate_zeros() - else: - cond = X > threshold - not_cond = np.logical_not(cond) - X[cond] = 1 - X[not_cond] = 0 - return X - - -class Binarizer(BaseEstimator, TransformerMixin): - """Binarize data (set feature values to 0 or 1) according to a threshold - - Values greater than the threshold map to 1, while values less than - or equal to the threshold map to 0. With the default threshold of 0, - only positive values map to 1. - - Binarization is a common operation on text count data where the - analyst can decide to only consider the presence or absence of a - feature rather than a quantified number of occurrences for instance. - - It can also be used as a pre-processing step for estimators that - consider boolean random variables (e.g. modelled using the Bernoulli - distribution in a Bayesian setting). - - Parameters - ---------- - threshold : float, optional (0.0 by default) - Feature values below this are replaced by 1, above it by 0. - Threshold may not be less than 0 for operations on sparse matrices. - - copy : boolean, optional, default is True - set to False to perform inplace binarization and avoid a copy (if - the input is already a numpy array or a scipy.sparse CSR matrix). - - Notes - ----- - If the input is a sparse matrix, only the non-zero values are subject - to update by the Binarizer class. - - This estimator is stateless (besides constructor parameters), the - fit method does nothing but is useful when used in a pipeline. 
- """ - - def __init__(self, threshold=0.0, copy=True): - self.threshold = threshold - self.copy = copy - - def fit(self, X, y=None): - """Do nothing and return the estimator unchanged - - This method is just there to implement the usual API and hence - work in pipelines. - """ - atleast2d_or_csr(X) - return self - - def transform(self, X, y=None, copy=None): - """Binarize each element of X - - Parameters - ---------- - X : array or scipy.sparse matrix with shape [n_samples, n_features] - The data to binarize, element by element. - scipy.sparse matrices should be in CSR format to avoid an - un-necessary copy. - """ - copy = copy if copy is not None else self.copy - return binarize(X, threshold=self.threshold, copy=copy) - - -def _transform_selected(X, transform, selected="all", copy=True): - """Apply a transform function to portion of selected features - - Parameters - ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) - Dense array or sparse matrix. - - transform : callable - A callable transform(X) -> X_transformed - - copy : boolean, optional - Copy X even if it could be avoided. - - selected: "all" or array of indices or mask - Specify which features to apply the transform to. - - Returns - ------- - X : array or sparse matrix, shape=(n_samples, n_features_new) - """ - if selected == "all": - return transform(X) - - X = atleast2d_or_csc(X, copy=copy) - - if len(selected) == 0: - return X - - n_features = X.shape[1] - ind = np.arange(n_features) - sel = np.zeros(n_features, dtype=bool) - sel[np.asarray(selected)] = True - not_sel = np.logical_not(sel) - n_selected = np.sum(sel) - - if n_selected == 0: - # No features selected. - return X - elif n_selected == n_features: - # All features selected. - return transform(X) - else: - X_sel = transform(X[:, ind[sel]]) - X_not_sel = X[:, ind[not_sel]] - - if sp.issparse(X_sel) or sp.issparse(X_not_sel): - return sp.hstack((X_sel, X_not_sel)) - else: - return np.hstack((X_sel, X_not_sel)) - - -class OneHotEncoder(BaseEstimator, TransformerMixin): - """Encode categorical integer features using a one-hot aka one-of-K scheme. - - The input to this transformer should be a matrix of integers, denoting - the values taken on by categorical (discrete) features. The output will be - a sparse matrix were each column corresponds to one possible value of one - feature. It is assumed that input features take on values in the range - [0, n_values). - - This encoding is needed for feeding categorical data to many scikit-learn - estimators, notably linear models and SVMs with the standard kernels. - - Parameters - ---------- - n_values : 'auto', int or array of ints - Number of values per feature. - - - 'auto' : determine value range from training data. - - int : maximum value for all features. - - array : maximum value per feature. - - categorical_features: "all" or array of indices or mask - Specify what features are treated as categorical. - - - 'all' (default): All features are treated as categorical. - - array of indices: Array of categorical feature indices. - - mask: Array of length n_features and with dtype=bool. - - Non-categorical features are always stacked to the right of the matrix. - - dtype : number type, default=np.float - Desired dtype of output. - - Attributes - ---------- - `active_features_` : array - Indices for active features, meaning values that actually occur - in the training set. Only available when n_values is ``'auto'``. - - `feature_indices_` : array of shape (n_features,) - Indices to feature ranges. 
- Feature ``i`` in the original data is mapped to features - from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` - (and then potentially masked by `active_features_` afterwards) - - `n_values_` : array of shape (n_features,) - Maximum number of values per feature. - - Examples - -------- - Given a dataset with three features and two samples, we let the encoder - find the maximum value per feature and transform the data to a binary - one-hot encoding. - - >>> from sklearn.preprocessing import OneHotEncoder - >>> enc = OneHotEncoder() - >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ -[1, 0, 2]]) # doctest: +ELLIPSIS - OneHotEncoder(categorical_features='all', dtype=<... 'float'>, - n_values='auto') - >>> enc.n_values_ - array([2, 3, 4]) - >>> enc.feature_indices_ - array([0, 2, 5, 9]) - >>> enc.transform([[0, 1, 1]]).toarray() - array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]]) - - See also - -------- - sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of - dictionary items (also handles string-valued features). - sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot - encoding of dictionary items or strings. - """ - def __init__(self, n_values="auto", categorical_features="all", - dtype=np.float): - self.n_values = n_values - self.categorical_features = categorical_features - self.dtype = dtype - - def fit(self, X, y=None): - """Fit OneHotEncoder to X. - - Parameters - ---------- - X : array-like, shape=(n_samples, n_feature) - Input array of type int. - - Returns - ------- - self - """ - self.fit_transform(X) - return self - - def _fit_transform(self, X): - """Assumes X contains only categorical features.""" - X = check_arrays(X, sparse_format='dense', dtype=np.int)[0] - if np.any(X < 0): - raise ValueError("X needs to contain only non-negative integers.") - n_samples, n_features = X.shape - if self.n_values == 'auto': - n_values = np.max(X, axis=0) + 1 - elif isinstance(self.n_values, numbers.Integral): - n_values = np.empty(n_features, dtype=np.int) - n_values.fill(self.n_values) - else: - try: - n_values = np.asarray(self.n_values, dtype=int) - except (ValueError, TypeError): - raise TypeError("Wrong type for parameter `n_values`. Expected" - " 'auto', int or array of ints, got %r" - % type(X)) - if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: - raise ValueError("Shape mismatch: if n_values is an array," - " it has to be of shape (n_features,).") - self.n_values_ = n_values - n_values = np.hstack([[0], n_values]) - indices = np.cumsum(n_values) - self.feature_indices_ = indices - - column_indices = (X + indices[:-1]).ravel() - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features) - data = np.ones(n_samples * n_features) - out = sp.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() - - if self.n_values == 'auto': - mask = np.array(out.sum(axis=0)).ravel() != 0 - active_features = np.where(mask)[0] - out = out[:, active_features] - self.active_features_ = active_features - - return out - - def fit_transform(self, X, y=None): - """Fit OneHotEncoder to X, then transform X. - - Equivalent to self.fit(X).transform(X), but more convenient and more - efficient. See fit for the parameters, transform for the return value. 
- """ - return _transform_selected(X, self._fit_transform, - self.categorical_features, copy=True) - - def _transform(self, X): - """Asssumes X contains only categorical features.""" - X = check_arrays(X, sparse_format='dense', dtype=np.int)[0] - if np.any(X < 0): - raise ValueError("X needs to contain only non-negative integers.") - n_samples, n_features = X.shape - - indices = self.feature_indices_ - if n_features != indices.shape[0] - 1: - raise ValueError("X has different shape than during fitting." - " Expected %d, got %d." - % (indices.shape[0] - 1, n_features)) - - n_values_check = np.max(X, axis=0) + 1 - if (n_values_check > self.n_values_).any(): - raise ValueError("Feature out of bounds. Try setting n_values.") - - column_indices = (X + indices[:-1]).ravel() - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features) - data = np.ones(n_samples * n_features) - out = sp.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() - if self.n_values == 'auto': - out = out[:, self.active_features_] - return out - - def transform(self, X): - """Transform X using one-hot encoding. - - Parameters - ---------- - X : array-like, shape=(n_samples, n_features) - Input array of type int. - - Returns - ------- - X_out : sparse matrix, dtype=int - Transformed input. - """ - return _transform_selected(X, self._transform, - self.categorical_features, copy=True) - - -class LabelEncoder(BaseEstimator, TransformerMixin): - """Encode labels with value between 0 and n_classes-1. - - Attributes - ---------- - `classes_`: array of shape [n_class] - Holds the label for each class. - - Examples - -------- - `LabelEncoder` can be used to normalize labels. - - >>> from sklearn import preprocessing - >>> le = preprocessing.LabelEncoder() - >>> le.fit([1, 2, 2, 6]) - LabelEncoder() - >>> le.classes_ - array([1, 2, 6]) - >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS - array([0, 0, 1, 2]...) - >>> le.inverse_transform([0, 0, 1, 2]) - array([1, 1, 2, 6]) - - It can also be used to transform non-numerical labels (as long as they are - hashable and comparable) to numerical labels. - - >>> le = preprocessing.LabelEncoder() - >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder() - >>> list(le.classes_) - ['amsterdam', 'paris', 'tokyo'] - >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS - array([2, 2, 1]...) - >>> list(le.inverse_transform([2, 2, 1])) - ['tokyo', 'tokyo', 'paris'] - - """ - - def _check_fitted(self): - if not hasattr(self, "classes_"): - raise ValueError("LabelNormalizer was not fitted yet.") - - def fit(self, y): - """Fit label encoder - - Parameters - ---------- - y : array-like of shape [n_samples] - Target values. - - Returns - ------- - self : returns an instance of self. - """ - self.classes_ = np.unique(y) - return self - - def fit_transform(self, y): - """Fit label encoder and return encoded labels - - Parameters - ---------- - y : array-like of shape [n_samples] - Target values. - - Returns - ------- - y : array-like of shape [n_samples] - """ - self.classes_, y = unique(y, return_inverse=True) - return y - - def transform(self, y): - """Transform labels to normalized encoding. - - Parameters - ---------- - y : array-like of shape [n_samples] - Target values. 
- - Returns - ------- - y : array-like of shape [n_samples] - """ - self._check_fitted() - - classes = np.unique(y) - if len(np.intersect1d(classes, self.classes_)) < len(classes): - diff = np.setdiff1d(classes, self.classes_) - raise ValueError("y contains new labels: %s" % str(diff)) - - return np.searchsorted(self.classes_, y) - - def inverse_transform(self, y): - """Transform labels back to original encoding. - - Parameters - ---------- - y : numpy array of shape [n_samples] - Target values. - - Returns - ------- - y : numpy array of shape [n_samples] - """ - self._check_fitted() - - y = np.asarray(y) - return self.classes_[y] - - -class LabelBinarizer(BaseEstimator, TransformerMixin): - """Binarize labels in a one-vs-all fashion - - Several regression and binary classification algorithms are - available in the scikit. A simple way to extend these algorithms - to the multi-class classification case is to use the so-called - one-vs-all scheme. - - At learning time, this simply consists in learning one regressor - or binary classifier per class. In doing so, one needs to convert - multi-class labels to binary labels (belong or does not belong - to the class). LabelBinarizer makes this process easy with the - transform method. - - At prediction time, one assigns the class for which the corresponding - model gave the greatest confidence. LabelBinarizer makes this easy - with the inverse_transform method. - - Parameters - ---------- - - neg_label: int (default: 0) - Value with which negative labels must be encoded. - - pos_label: int (default: 1) - Value with which positive labels must be encoded. - - Attributes - ---------- - `classes_`: array of shape [n_class] - Holds the label for each class. - - Examples - -------- - >>> from sklearn import preprocessing - >>> lb = preprocessing.LabelBinarizer() - >>> lb.fit([1, 2, 6, 4, 2]) - LabelBinarizer(neg_label=0, pos_label=1) - >>> lb.classes_ - array([1, 2, 4, 6]) - >>> lb.transform([1, 6]) - array([[1, 0, 0, 0], - [0, 0, 0, 1]]) - - >>> lb.fit_transform([(1, 2), (3,)]) - array([[1, 1, 0], - [0, 0, 1]]) - >>> lb.classes_ - array([1, 2, 3]) - """ - - def __init__(self, neg_label=0, pos_label=1): - if neg_label >= pos_label: - raise ValueError("neg_label must be strictly less than pos_label.") - - self.neg_label = neg_label - self.pos_label = pos_label - - def _check_fitted(self): - if not hasattr(self, "classes_"): - raise ValueError("LabelBinarizer was not fitted yet.") - - def fit(self, y): - """Fit label binarizer - - Parameters - ---------- - y : numpy array of shape [n_samples] or sequence of sequences - Target values. In the multilabel case the nested sequences can - have variable lengths. - - Returns - ------- - self : returns an instance of self. - """ - y_type = type_of_target(y) - self.multilabel = y_type.startswith('multilabel') - if self.multilabel: - self.indicator_matrix_ = y_type == 'multilabel-indicator' - - self.classes_ = unique_labels(y) - - return self - - def transform(self, y): - """Transform multi-class labels to binary labels - - The output of transform is sometimes referred to by some authors as the - 1-of-K coding scheme. - - Parameters - ---------- - y : numpy array of shape [n_samples] or sequence of sequences - Target values. In the multilabel case the nested sequences can - have variable lengths. 
- - Returns - ------- - Y : numpy array of shape [n_samples, n_classes] - """ - self._check_fitted() - - y_type = type_of_target(y) - - if self.multilabel or len(self.classes_) > 2: - if y_type == 'multilabel-indicator': - # nothing to do as y is already a label indicator matrix - return y - - Y = np.zeros((len(y), len(self.classes_)), dtype=np.int) - else: - Y = np.zeros((len(y), 1), dtype=np.int) - - Y += self.neg_label - - y_is_multilabel = y_type.startswith('multilabel') - - if y_is_multilabel and not self.multilabel: - raise ValueError("The object was not fitted with multilabel" - " input!") - - elif self.multilabel: - if not y_is_multilabel: - raise ValueError("y should be a list of label lists/tuples," - "got %r" % (y,)) - - # inverse map: label => column index - imap = dict((v, k) for k, v in enumerate(self.classes_)) - - for i, label_tuple in enumerate(y): - for label in label_tuple: - Y[i, imap[label]] = self.pos_label - - return Y - - else: - y = np.asarray(y) - - if len(self.classes_) == 2: - Y[y == self.classes_[1], 0] = self.pos_label - return Y - - elif len(self.classes_) >= 2: - for i, k in enumerate(self.classes_): - Y[y == k, i] = self.pos_label - return Y - - else: - # Only one class, returns a matrix with all negative labels. - return Y - - def inverse_transform(self, Y, threshold=None): - """Transform binary labels back to multi-class labels - - Parameters - ---------- - Y : numpy array of shape [n_samples, n_classes] - Target values. - - threshold : float or None - Threshold used in the binary and multi-label cases. - - Use 0 when: - - Y contains the output of decision_function (classifier) - Use 0.5 when: - - Y contains the output of predict_proba - - If None, the threshold is assumed to be half way between - neg_label and pos_label. - - Returns - ------- - y : numpy array of shape [n_samples] or sequence of sequences - Target values. In the multilabel case the nested sequences can - have variable lengths. - - Notes - ----- - In the case when the binary labels are fractional - (probabilistic), inverse_transform chooses the class with the - greatest value. Typically, this allows to use the output of a - linear model's decision_function method directly as the input - of inverse_transform. - """ - self._check_fitted() - - if threshold is None: - half = (self.pos_label - self.neg_label) / 2.0 - threshold = self.neg_label + half - - if self.multilabel: - Y = np.array(Y > threshold, dtype=int) - # Return the predictions in the same format as in fit - if self.indicator_matrix_: - # Label indicator matrix format - return Y - else: - # Lists of tuples format - return [tuple(self.classes_[np.flatnonzero(Y[i])]) - for i in range(Y.shape[0])] - - if len(Y.shape) == 1 or Y.shape[1] == 1: - y = np.array(Y.ravel() > threshold, dtype=int) - - else: - y = Y.argmax(axis=1) - - return self.classes_[y] - - -class KernelCenterer(BaseEstimator, TransformerMixin): - """Center a kernel matrix - - Let K(x_i, x_j) be a kernel defined by K(x_i, x_j) = phi(x_i)^T phi(x_j), - where phi(x) is a function mapping x to a hilbert space. KernelCenterer is - a class to center (i.e., normalize to have zero-mean) the data without - explicitly computing phi(x). It is equivalent equivalent to centering - phi(x) with sklearn.preprocessing.StandardScaler(with_std=False). - """ - - def fit(self, K, y=None): - """Fit KernelCenterer - - Parameters - ---------- - K : numpy array of shape [n_samples, n_samples] - Kernel matrix. - - Returns - ------- - self : returns an instance of self. 
- """ - K = array2d(K) - n_samples = K.shape[0] - self.K_fit_rows_ = np.sum(K, axis=0) / n_samples - self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples - return self - - def transform(self, K, y=None, copy=True): - """Center kernel - - Parameters - ---------- - K : numpy array of shape [n_samples1, n_samples2] - Kernel matrix. - - Returns - ------- - K_new : numpy array of shape [n_samples1, n_samples2] - """ - K = array2d(K) - if copy: - K = K.copy() - - K_pred_cols = (np.sum(K, axis=1) / - self.K_fit_rows_.shape[0])[:, np.newaxis] - - K -= self.K_fit_rows_ - K -= K_pred_cols - K += self.K_fit_all_ - - return K - - -def add_dummy_feature(X, value=1.0): - """Augment dataset with an additional dummy feature. - - This is useful for fitting an intercept term with implementations which - cannot otherwise fit it directly. - - Parameters - ---------- - X : array or scipy.sparse matrix with shape [n_samples, n_features] - Data. - - value : float - Value to use for the dummy feature. - - Returns - ------- - - X : array or scipy.sparse matrix with shape [n_samples, n_features + 1] - Same data with dummy feature added as first column. - - Examples - -------- - - >>> from sklearn.preprocessing import add_dummy_feature - >>> add_dummy_feature([[0, 1], [1, 0]]) - array([[ 1., 0., 1.], - [ 1., 1., 0.]]) - """ - X = safe_asarray(X) - n_samples, n_features = X.shape - shape = (n_samples, n_features + 1) - if sp.issparse(X): - if sp.isspmatrix_coo(X): - # Shift columns to the right. - col = X.col + 1 - # Column indices of dummy feature are 0 everywhere. - col = np.concatenate((np.zeros(n_samples), col)) - # Row indices of dummy feature are 0, ..., n_samples-1. - row = np.concatenate((np.arange(n_samples), X.row)) - # Prepend the dummy feature n_samples times. - data = np.concatenate((np.ones(n_samples) * value, X.data)) - return sp.coo_matrix((data, (row, col)), shape) - elif sp.isspmatrix_csc(X): - # Shift index pointers since we need to add n_samples elements. - indptr = X.indptr + n_samples - # indptr[0] must be 0. - indptr = np.concatenate((np.array([0]), indptr)) - # Row indices of dummy feature are 0, ..., n_samples-1. - indices = np.concatenate((np.arange(n_samples), X.indices)) - # Prepend the dummy feature n_samples times. - data = np.concatenate((np.ones(n_samples) * value, X.data)) - return sp.csc_matrix((data, indices, indptr), shape) - else: - klass = X.__class__ - return klass(add_dummy_feature(X.tocoo(), value)) - else: - return np.hstack((np.ones((n_samples, 1)) * value, X)) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 093137d078000..519f1233e0c18 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1901,3 +1901,118 @@ def transform(self, X): """ return _transform_selected(X, self._transform, self.categorical_features, copy=True) + + +class RankScaler(BaseEstimator, TransformerMixin): + """Rank-standardize features to a percentile, in the range [0, 1]. + + Rank-scaling happens independently on each feature, by determining + the percentile of the feature value. + A feature value that is smaller than observed during fitting + will scale to 0. + A feature value that is larger than observed during fitting + will scale to 1. + A feature value that is the median will scale to 0.5. + + Standardization of a dataset is a common requirement for many + machine learning estimators. Rank-scaling is useful when + estimators perform badly on StandardScalar features. 
Rank-scaling + is more robust than StandardScaler, because outliers can't have + large values post scaling. It is an empirical question whether + you want outliers to be given high importance (StandardScaler) + or not (RankScaler). + + Parameters + ---------- + n_ranks : int, 1000 by default + The number of different ranks possible. + i.e. The number of indices in the compressed ranking matrix + `sort_X_`. + This is an approximation, to save memory and transform + computation time. + e.g. if 1000, transformed values will have resolution 0.001. + If `None`, we store the full size matrix, comparable + in size to the initial fit `X`. + + Attributes + ---------- + `sort_X_` : array of ints, shape (n_samples, n_features) + The rank-index of every feature in the fit X. + + See also + -------- + :class:`sklearn.preprocessing.StandardScaler` to perform standardization + that is faster, but less robust to outliers. + """ + + def __init__(self, n_ranks=1000): + # TODO: Add min and max parameters? Default = [0, 1] + self.n_ranks = n_ranks + + def fit(self, X, y=None): + """Compute the feature ranks for later scaling. + + fit will take time O(n_features * n_samples * log(n_samples)), + because it must sort the entire matrix. + + It use memory O(n_features * n_ranks). + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The data used to compute feature ranks. + """ + X = array2d(X) + n_samples, n_features = X.shape + full_sort_X_ = np.sort(X, axis=0) + if not self.n_ranks or self.n_ranks >= n_samples: + # Store the full matrix + self.sort_X_ = full_sort_X_ + else: + # Approximate the stored sort_X_ + self.sort_X_ = np.zeros((self.n_ranks, n_features)) + for i in range(self.n_ranks): + for j in range(n_features): + # Find the corresponding i in the original ranking + iorig = i * 1. * n_samples / self.n_ranks + ioriglo = int(iorig) + iorighi = ioriglo + 1 + + if ioriglo == n_samples: + self.sort_X_[i, j] = full_sort_X_[ioriglo, j] + else: + # And use linear interpolation to combine the + # original values. + wlo = (1 - (iorig - ioriglo)) + whi = (1 - (iorighi - iorig)) + assert wlo >= 0 and wlo <= 1 + assert whi >= 0 and whi <= 1 + assert_almost_equal(wlo+whi, 1.) + self.sort_X_[i, j] = wlo * full_sort_X_[ioriglo, j] \ + + whi * full_sort_X_[iorighi, j] + return self + + def transform(self, X): + """Perform rank-standardization. + + transform will take O(n_features * n_samples * log(n_ranks)), + where `n_fit_samples` is the number of samples used during `fit`. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The data used to scale along the features axis. + """ + X = array2d(X) + warn_if_not_float(X, estimator=self) + # TODO: Can add a copy parameter, and simply overwrite X if copy=False + X2 = np.zeros(X.shape) + for j in range(X.shape[1]): + lidx = np.searchsorted(self.sort_X_[:, j], X[:, j], side='left') + ridx = np.searchsorted(self.sort_X_[:, j], X[:, j], side='right') + v = 1. * (lidx + ridx) / (2 * self.sort_X_.shape[0]) + X2[:,j] = v + return X2 + + # TODO : Add inverse_transform method. + # I believe we could reuse the approximation code in `fit`. 
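A short usage sketch of the RankScaler added above, assuming this branch's layout (the import path is the
one used by the new test below; RankScaler is not part of a released scikit-learn API). The input matrix
and the expected outputs are the same values the test asserts:

    import numpy as np
    from sklearn.preprocessing.data import RankScaler

    X = np.array([[1., 0., 0., 0., 1.],
                  [2., 1., 4., 1., 1.],
                  [3., 2., 3., 1., 0.],
                  [3., 0., 0., 4., 1.]])

    scaler = RankScaler()  # default n_ranks=1000 >= n_samples, so ranks are stored exactly
    X_scaled = scaler.fit(X).transform(X)
    print(X_scaled[0])     # expected: [0.125  0.25   0.25   0.125  0.625]

    # Values outside the fitted range clip to the ends of [0, 1].
    X2 = np.array([[0., 1.5, 0., 5., 10.]])
    print(scaler.transform(X2))  # expected: [[0.    0.75  0.25  1.    1.  ]]
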
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 7a51049b60242..3df17f68dcceb 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1,4 +1,3 @@ - # Authors: # # Giorgio Patrini @@ -48,6 +47,7 @@ from sklearn.preprocessing.data import robust_scale from sklearn.preprocessing.data import add_dummy_feature from sklearn.preprocessing.data import PolynomialFeatures +from sklearn.preprocessing.data import RankScaler from sklearn.exceptions import DataConversionWarning from sklearn.pipeline import Pipeline @@ -290,6 +290,42 @@ def test_scaler_2d_arrays(): # Check that X has not been copied assert_true(X_scaled is not X) + X = np.array([[1, 0, 0, 0, 1], + [2, 1, 4, 1, 1], + [3, 2, 3, 1, 0], + [3, 0, 0, 4, 1]]) + + rank_scaler = RankScaler() + rank_scaler.fit(X) + X_scaled = rank_scaler.transform(X) + assert_array_almost_equal(X_scaled, [[0.125, 0.25, 0.25, 0.125, 0.625], + [0.375, 0.625, 0.875, 0.5, 0.625], + [0.75, 0.875, 0.625, 0.5, 0.125], + [0.75, 0.25, 0.25, 0.875, 0.625]]) + + X2 = np.array([[0, 1.5, 0, 5, 10]]) + X2_scaled = rank_scaler.transform(X2) + assert_array_almost_equal(X2_scaled, [[0., 0.75, 0.25, 1., 1.]]) + + # Check RankScaler at different n_ranks + n_features = 100 + for n_samples in [10, 100, 1000]: + for n_ranks in [n_samples + 1, n_samples, n_samples - 1, + int(n_samples / 2), int(n_samples / 7), int(n_samples / 10)]: + X = rng.randn(n_samples, n_features) + rank_scaler1 = RankScaler(n_ranks=None) + rank_scaler2 = RankScaler(n_ranks=n_ranks) + rank_scaler1.fit(X) + rank_scaler2.fit(X) + + X2 = rng.randn(1000, n_features) + X21 = rank_scaler1.transform(X2) + X22 = rank_scaler2.transform(X2) + + # In the approximate version X22, all values must + # be within 1./n_ranks of the exact value X11. + assert_true(np.all(np.fabs(X21 - X22) < 1. 
/ n_ranks)) + def test_handle_zeros_in_scale(): s1 = np.array([0, 1, 2, 3]) diff --git a/sklearn/tests/test_preprocessing.py b/sklearn/tests/test_preprocessing.py deleted file mode 100644 index 6c87243d1be88..0000000000000 --- a/sklearn/tests/test_preprocessing.py +++ /dev/null @@ -1,841 +0,0 @@ -import warnings -import numpy as np -import numpy.linalg as la -import scipy.sparse as sp - -from sklearn.utils.testing import assert_almost_equal -from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_true -from sklearn.utils.testing import assert_false - -from sklearn.utils.sparsefuncs import mean_variance_axis0 -from sklearn.preprocessing import Binarizer -from sklearn.preprocessing import KernelCenterer -from sklearn.preprocessing import LabelBinarizer -from sklearn.preprocessing import _transform_selected -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import LabelEncoder -from sklearn.preprocessing import Normalizer -from sklearn.preprocessing import normalize -from sklearn.preprocessing import StandardScaler -from sklearn.preprocessing import scale -from sklearn.preprocessing import MinMaxScaler -from sklearn.preprocessing import RankScaler -from sklearn.preprocessing import add_dummy_feature - -from sklearn import datasets -from sklearn.linear_model.stochastic_gradient import SGDClassifier - -iris = datasets.load_iris() - - -def toarray(a): - if hasattr(a, "toarray"): - a = a.toarray() - return a - - -def test_scaler_1d(): - """Test scaling of dataset along single axis""" - rng = np.random.RandomState(0) - X = rng.randn(5) - X_orig_copy = X.copy() - - scaler = StandardScaler() - X_scaled = scaler.fit(X).transform(X, copy=False) - assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) - assert_array_almost_equal(X_scaled.std(axis=0), 1.0) - - # check inverse transform - X_scaled_back = scaler.inverse_transform(X_scaled) - assert_array_almost_equal(X_scaled_back, X_orig_copy) - - # Test with 1D list - X = [0., 1., 2, 0.4, 1.] 
- scaler = StandardScaler() - X_scaled = scaler.fit(X).transform(X, copy=False) - assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) - assert_array_almost_equal(X_scaled.std(axis=0), 1.0) - - X_scaled = scale(X) - assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) - assert_array_almost_equal(X_scaled.std(axis=0), 1.0) - -# rank_scaler = RankScaler() -# X_rank_scaled = rank_scaler.fit(X).transform(X) - - -def test_scaler_2d_arrays(): - """Test scaling of 2d array along first axis""" - rng = np.random.RandomState(0) - X = rng.randn(4, 5) - X[:, 0] = 0.0 # first feature is always of zero - - scaler = StandardScaler() - X_scaled = scaler.fit(X).transform(X, copy=True) - assert_false(np.any(np.isnan(X_scaled))) - - assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0]) - assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) - # Check that X has been copied - assert_true(X_scaled is not X) - - # check inverse transform - X_scaled_back = scaler.inverse_transform(X_scaled) - assert_true(X_scaled_back is not X) - assert_true(X_scaled_back is not X_scaled) - assert_array_almost_equal(X_scaled_back, X) - - X_scaled = scale(X, axis=1, with_std=False) - assert_false(np.any(np.isnan(X_scaled))) - assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0]) - X_scaled = scale(X, axis=1, with_std=True) - assert_false(np.any(np.isnan(X_scaled))) - assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0]) - assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0]) - # Check that the data hasn't been modified - assert_true(X_scaled is not X) - - X_scaled = scaler.fit(X).transform(X, copy=False) - assert_false(np.any(np.isnan(X_scaled))) - assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0]) - assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) - # Check that X has not been copied - assert_true(X_scaled is X) - - X = rng.randn(4, 5) - X[:, 0] = 1.0 # first feature is a constant, non zero feature - scaler = StandardScaler() - X_scaled = scaler.fit(X).transform(X, copy=True) - assert_false(np.any(np.isnan(X_scaled))) - assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0]) - assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) - # Check that X has not been copied - assert_true(X_scaled is not X) - - X = np.array([[1, 0, 0, 0, 1], - [2, 1, 4, 1, 1], - [3, 2, 3, 1, 0], - [3, 0, 0, 4, 1]]) - - rank_scaler = RankScaler() - rank_scaler.fit(X) - X_scaled = rank_scaler.transform(X) - assert_array_almost_equal(X_scaled, [[0.125, 0.25, 0.25, 0.125, 0.625], - [0.375, 0.625, 0.875, 0.5, 0.625], - [0.75, 0.875, 0.625, 0.5, 0.125], - [0.75, 0.25, 0.25, 0.875, 0.625]]) - - X2 = np.array([[0, 1.5, 0, 5, 10]]) - X2_scaled = rank_scaler.transform(X2) - assert_array_almost_equal(X2_scaled, [[0., 0.75, 0.25, 1., 1.]]) - - # Check RankScaler at different n_ranks - n_features = 100 - for n_samples in [10, 100, 1000]: - for n_ranks in [n_samples + 1, n_samples, n_samples - 1, - int(n_samples / 2), int(n_samples / 7), int(n_samples / 10)]: - X = rng.randn(n_samples, n_features) - rank_scaler1 = RankScaler(n_ranks=None) - rank_scaler2 = RankScaler(n_ranks=n_ranks) - rank_scaler1.fit(X) - rank_scaler2.fit(X) - - X2 = rng.randn(1000, n_features) - X21 = rank_scaler1.transform(X2) - X22 = rank_scaler2.transform(X2) - - # In the approximate version X22, all values must - # be within 1./n_ranks of the exact value X11. - assert_true(np.all(np.fabs(X21 - X22) < 1. 
/ n_ranks)) - - -def test_min_max_scaler_iris(): - X = iris.data - scaler = MinMaxScaler() - # default params - X_trans = scaler.fit_transform(X) - assert_array_almost_equal(X_trans.min(axis=0), 0) - assert_array_almost_equal(X_trans.min(axis=0), 0) - assert_array_almost_equal(X_trans.max(axis=0), 1) - X_trans_inv = scaler.inverse_transform(X_trans) - assert_array_almost_equal(X, X_trans_inv) - - # not default params: min=1, max=2 - scaler = MinMaxScaler(feature_range=(1, 2)) - X_trans = scaler.fit_transform(X) - assert_array_almost_equal(X_trans.min(axis=0), 1) - assert_array_almost_equal(X_trans.max(axis=0), 2) - X_trans_inv = scaler.inverse_transform(X_trans) - assert_array_almost_equal(X, X_trans_inv) - - # min=-.5, max=.6 - scaler = MinMaxScaler(feature_range=(-.5, .6)) - X_trans = scaler.fit_transform(X) - assert_array_almost_equal(X_trans.min(axis=0), -.5) - assert_array_almost_equal(X_trans.max(axis=0), .6) - X_trans_inv = scaler.inverse_transform(X_trans) - assert_array_almost_equal(X, X_trans_inv) - - # raises on invalid range - scaler = MinMaxScaler(feature_range=(2, 1)) - assert_raises(ValueError, scaler.fit, X) - - -def test_min_max_scaler_zero_variance_features(): - """Check min max scaler on toy data with zero variance features""" - X = [[0., 1., 0.5], - [0., 1., -0.1], - [0., 1., 1.1]] - - X_new = [[+0., 2., 0.5], - [-1., 1., 0.0], - [+0., 1., 1.5]] - - # default params - scaler = MinMaxScaler() - X_trans = scaler.fit_transform(X) - X_expected_0_1 = [[0., 0., 0.5], - [0., 0., 0.0], - [0., 0., 1.0]] - assert_array_almost_equal(X_trans, X_expected_0_1) - X_trans_inv = scaler.inverse_transform(X_trans) - assert_array_almost_equal(X, X_trans_inv) - - X_trans_new = scaler.transform(X_new) - X_expected_0_1_new = [[+0., 1., 0.500], - [-1., 0., 0.083], - [+0., 0., 1.333]] - assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2) - - # not default params - scaler = MinMaxScaler(feature_range=(1, 2)) - X_trans = scaler.fit_transform(X) - X_expected_1_2 = [[1., 1., 1.5], - [1., 1., 1.0], - [1., 1., 2.0]] - assert_array_almost_equal(X_trans, X_expected_1_2) - - -def test_scaler_without_centering(): - rng = np.random.RandomState(42) - X = rng.randn(4, 5) - X[:, 0] = 0.0 # first feature is always of zero - X_csr = sp.csr_matrix(X) - X_csc = sp.csc_matrix(X) - - assert_raises(ValueError, StandardScaler().fit, X_csr) - - null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) - X_null = null_transform.fit_transform(X_csr) - assert_array_equal(X_null.data, X_csr.data) - X_orig = null_transform.inverse_transform(X_null) - assert_array_equal(X_orig.data, X_csr.data) - - scaler = StandardScaler(with_mean=False).fit(X) - X_scaled = scaler.transform(X, copy=True) - assert_false(np.any(np.isnan(X_scaled))) - - scaler_csr = StandardScaler(with_mean=False).fit(X_csr) - X_csr_scaled = scaler_csr.transform(X_csr, copy=True) - assert_false(np.any(np.isnan(X_csr_scaled.data))) - - scaler_csc = StandardScaler(with_mean=False).fit(X_csc) - X_csc_scaled = scaler_csr.transform(X_csc, copy=True) - assert_false(np.any(np.isnan(X_csc_scaled.data))) - - assert_equal(scaler.mean_, scaler_csr.mean_) - assert_array_almost_equal(scaler.std_, scaler_csr.std_) - - assert_equal(scaler.mean_, scaler_csc.mean_) - assert_array_almost_equal(scaler.std_, scaler_csc.std_) - - assert_array_almost_equal( - X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2) - assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) - - X_csr_scaled_mean, X_csr_scaled_std = 
mean_variance_axis0(X_csr_scaled) - assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) - assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) - - # Check that X has not been modified (copy) - assert_true(X_scaled is not X) - assert_true(X_csr_scaled is not X_csr) - - X_scaled_back = scaler.inverse_transform(X_scaled) - assert_true(X_scaled_back is not X) - assert_true(X_scaled_back is not X_scaled) - assert_array_almost_equal(X_scaled_back, X) - - X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled) - assert_true(X_csr_scaled_back is not X_csr) - assert_true(X_csr_scaled_back is not X_csr_scaled) - assert_array_almost_equal(X_csr_scaled_back.toarray(), X) - - X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc()) - assert_true(X_csc_scaled_back is not X_csc) - assert_true(X_csc_scaled_back is not X_csc_scaled) - assert_array_almost_equal(X_csc_scaled_back.toarray(), X) - - -def test_scaler_without_copy(): - """Check that StandardScaler.fit does not change input""" - rng = np.random.RandomState(42) - X = rng.randn(4, 5) - X[:, 0] = 0.0 # first feature is always of zero - X_csr = sp.csr_matrix(X) - - X_copy = X.copy() - StandardScaler(copy=False).fit(X) - assert_array_equal(X, X_copy) - - X_csr_copy = X_csr.copy() - StandardScaler(with_mean=False, copy=False).fit(X_csr) - assert_array_equal(X_csr.toarray(), X_csr_copy.toarray()) - - -def test_scale_sparse_with_mean_raise_exception(): - rng = np.random.RandomState(42) - X = rng.randn(4, 5) - X_csr = sp.csr_matrix(X) - - # check scaling and fit with direct calls on sparse data - assert_raises(ValueError, scale, X_csr, with_mean=True) - assert_raises(ValueError, StandardScaler(with_mean=True).fit, X_csr) - - # check transform and inverse_transform after a fit on a dense array - scaler = StandardScaler(with_mean=True).fit(X) - assert_raises(ValueError, scaler.transform, X_csr) - - X_transformed_csr = sp.csr_matrix(scaler.transform(X)) - assert_raises(ValueError, scaler.inverse_transform, X_transformed_csr) - - -def test_scale_function_without_centering(): - rng = np.random.RandomState(42) - X = rng.randn(4, 5) - X[:, 0] = 0.0 # first feature is always of zero - X_csr = sp.csr_matrix(X) - - X_scaled = scale(X, with_mean=False) - assert_false(np.any(np.isnan(X_scaled))) - - X_csr_scaled = scale(X_csr, with_mean=False) - assert_false(np.any(np.isnan(X_csr_scaled.data))) - - # test csc has same outcome - X_csc_scaled = scale(X_csr.tocsc(), with_mean=False) - assert_array_almost_equal(X_scaled, X_csc_scaled.toarray()) - - # raises value error on axis != 0 - assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1) - - assert_array_almost_equal(X_scaled.mean(axis=0), - [0., -0.01, 2.24, -0.35, -0.78], 2) - assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) - # Check that X has not been copied - assert_true(X_scaled is not X) - - X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled) - assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) - assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) - - -def test_warning_scaling_integers(): - """Check warning when scaling integer data""" - X = np.array([[1, 2, 0], - [0, 0, 0]], dtype=np.uint8) - - with warnings.catch_warnings(record=True) as w: - StandardScaler().fit(X) - assert_equal(len(w), 1) - - with warnings.catch_warnings(record=True) as w: - MinMaxScaler().fit(X) - assert_equal(len(w), 1) - - -def test_normalizer_l1(): - rng = np.random.RandomState(0) - X_dense = rng.randn(4, 5) 
- X_sparse_unpruned = sp.csr_matrix(X_dense) - - # set the row number 3 to zero - X_dense[3, :] = 0.0 - - # set the row number 3 to zero without pruning (can happen in real life) - indptr_3 = X_sparse_unpruned.indptr[3] - indptr_4 = X_sparse_unpruned.indptr[4] - X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0 - - # build the pruned variant using the regular constructor - X_sparse_pruned = sp.csr_matrix(X_dense) - - # check inputs that support the no-copy optim - for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): - - normalizer = Normalizer(norm='l1', copy=True) - X_norm = normalizer.transform(X) - assert_true(X_norm is not X) - X_norm1 = toarray(X_norm) - - normalizer = Normalizer(norm='l1', copy=False) - X_norm = normalizer.transform(X) - assert_true(X_norm is X) - X_norm2 = toarray(X_norm) - - for X_norm in (X_norm1, X_norm2): - row_sums = np.abs(X_norm).sum(axis=1) - for i in range(3): - assert_almost_equal(row_sums[i], 1.0) - assert_almost_equal(row_sums[3], 0.0) - - # check input for which copy=False won't prevent a copy - for init in (sp.coo_matrix, sp.csc_matrix, sp.lil_matrix): - X = init(X_dense) - X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X) - - assert_true(X_norm is not X) - assert_true(isinstance(X_norm, sp.csr_matrix)) - - X_norm = toarray(X_norm) - for i in range(3): - assert_almost_equal(row_sums[i], 1.0) - assert_almost_equal(la.norm(X_norm[3]), 0.0) - - -def test_normalizer_l2(): - rng = np.random.RandomState(0) - X_dense = rng.randn(4, 5) - X_sparse_unpruned = sp.csr_matrix(X_dense) - - # set the row number 3 to zero - X_dense[3, :] = 0.0 - - # set the row number 3 to zero without pruning (can happen in real life) - indptr_3 = X_sparse_unpruned.indptr[3] - indptr_4 = X_sparse_unpruned.indptr[4] - X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0 - - # build the pruned variant using the regular constructor - X_sparse_pruned = sp.csr_matrix(X_dense) - - # check inputs that support the no-copy optim - for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): - - normalizer = Normalizer(norm='l2', copy=True) - X_norm1 = normalizer.transform(X) - assert_true(X_norm1 is not X) - X_norm1 = toarray(X_norm1) - - normalizer = Normalizer(norm='l2', copy=False) - X_norm2 = normalizer.transform(X) - assert_true(X_norm2 is X) - X_norm2 = toarray(X_norm2) - - for X_norm in (X_norm1, X_norm2): - for i in range(3): - assert_almost_equal(la.norm(X_norm[i]), 1.0) - assert_almost_equal(la.norm(X_norm[3]), 0.0) - - # check input for which copy=False won't prevent a copy - for init in (sp.coo_matrix, sp.csc_matrix, sp.lil_matrix): - X = init(X_dense) - X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X) - - assert_true(X_norm is not X) - assert_true(isinstance(X_norm, sp.csr_matrix)) - - X_norm = toarray(X_norm) - for i in range(3): - assert_almost_equal(la.norm(X_norm[i]), 1.0) - assert_almost_equal(la.norm(X_norm[3]), 0.0) - - -def test_normalize_errors(): - """Check that invalid arguments yield ValueError""" - assert_raises(ValueError, normalize, [[0]], axis=2) - assert_raises(ValueError, normalize, [[0]], norm='l3') - - -def test_binarizer(): - X_ = np.array([[1, 0, 5], [2, 3, -1]]) - - for init in (np.array, list, sp.csr_matrix, sp.csc_matrix): - - X = init(X_.copy()) - - binarizer = Binarizer(threshold=2.0, copy=True) - X_bin = toarray(binarizer.transform(X)) - assert_equal(np.sum(X_bin == 0), 4) - assert_equal(np.sum(X_bin == 1), 2) - X_bin = binarizer.transform(X) - assert_equal(sp.issparse(X), sp.issparse(X_bin)) - - binarizer = 
Binarizer(copy=True).fit(X) - X_bin = toarray(binarizer.transform(X)) - assert_true(X_bin is not X) - assert_equal(np.sum(X_bin == 0), 2) - assert_equal(np.sum(X_bin == 1), 4) - - binarizer = Binarizer(copy=True) - X_bin = binarizer.transform(X) - assert_true(X_bin is not X) - X_bin = toarray(X_bin) - assert_equal(np.sum(X_bin == 0), 2) - assert_equal(np.sum(X_bin == 1), 4) - - binarizer = Binarizer(copy=False) - X_bin = binarizer.transform(X) - if init is not list: - assert_true(X_bin is X) - X_bin = toarray(X_bin) - assert_equal(np.sum(X_bin == 0), 2) - assert_equal(np.sum(X_bin == 1), 4) - - binarizer = Binarizer(threshold=-0.5, copy=True) - for init in (np.array, list): - X = init(X_.copy()) - - X_bin = toarray(binarizer.transform(X)) - assert_equal(np.sum(X_bin == 0), 1) - assert_equal(np.sum(X_bin == 1), 5) - X_bin = binarizer.transform(X) - - # Cannot use threshold < 0 for sparse - assert_raises(ValueError, binarizer.transform, sp.csc_matrix(X)) - - -def test_label_binarizer(): - lb = LabelBinarizer() - - # two-class case - inp = ["neg", "pos", "pos", "neg"] - expected = np.array([[0, 1, 1, 0]]).T - got = lb.fit_transform(inp) - assert_array_equal(expected, got) - assert_array_equal(lb.inverse_transform(got), inp) - - # multi-class case - inp = ["spam", "ham", "eggs", "ham", "0"] - expected = np.array([[0, 0, 0, 1], - [0, 0, 1, 0], - [0, 1, 0, 0], - [0, 0, 1, 0], - [1, 0, 0, 0]]) - got = lb.fit_transform(inp) - assert_array_equal(expected, got) - assert_array_equal(lb.inverse_transform(got), inp) - - -def test_label_binarizer_set_label_encoding(): - lb = LabelBinarizer(neg_label=-2, pos_label=2) - - # two-class case - inp = np.array([0, 1, 1, 0]) - expected = np.array([[-2, 2, 2, -2]]).T - got = lb.fit_transform(inp) - assert_array_equal(expected, got) - assert_array_equal(lb.inverse_transform(got), inp) - - # multi-class case - inp = np.array([3, 2, 1, 2, 0]) - expected = np.array([[-2, -2, -2, +2], - [-2, -2, +2, -2], - [-2, +2, -2, -2], - [-2, -2, +2, -2], - [+2, -2, -2, -2]]) - got = lb.fit_transform(inp) - assert_array_equal(expected, got) - assert_array_equal(lb.inverse_transform(got), inp) - - -def test_label_binarizer_multilabel(): - lb = LabelBinarizer() - - # test input as lists of tuples - inp = [(2, 3), (1,), (1, 2)] - indicator_mat = np.array([[0, 1, 1], - [1, 0, 0], - [1, 1, 0]]) - got = lb.fit_transform(inp) - assert_array_equal(indicator_mat, got) - assert_equal(lb.inverse_transform(got), inp) - - # test input as label indicator matrix - lb.fit(indicator_mat) - assert_array_equal(indicator_mat, - lb.inverse_transform(indicator_mat)) - - # regression test for the two-class multilabel case - lb = LabelBinarizer() - - inp = [[1, 0], [0], [1], [0, 1]] - expected = np.array([[1, 1], - [1, 0], - [0, 1], - [1, 1]]) - got = lb.fit_transform(inp) - assert_array_equal(expected, got) - assert_equal([set(x) for x in lb.inverse_transform(got)], - [set(x) for x in inp]) - - -def test_label_binarizer_errors(): - """Check that invalid arguments yield ValueError""" - one_class = np.array([0, 0, 0, 0]) - lb = LabelBinarizer().fit(one_class) - - multi_label = [(2, 3), (0,), (0, 2)] - assert_raises(ValueError, lb.transform, multi_label) - - lb = LabelBinarizer() - assert_raises(ValueError, lb.transform, []) - assert_raises(ValueError, lb.inverse_transform, []) - - assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=1) - assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=2) - - -def test_one_hot_encoder(): - """Test OneHotEncoder's fit and transform.""" - X = 
[[3, 2, 1], [0, 1, 1]] - enc = OneHotEncoder() - # discover max values automatically - X_trans = enc.fit_transform(X).toarray() - assert_equal(X_trans.shape, (2, 5)) - assert_array_equal(enc.active_features_, - np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) - assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) - - # check outcome - assert_array_equal(X_trans, - [[0., 1., 0., 1., 1.], - [1., 0., 1., 0., 1.]]) - - # max value given as 3 - enc = OneHotEncoder(n_values=4) - X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (2, 4 * 3)) - assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) - - # max value given per feature - enc = OneHotEncoder(n_values=[3, 2, 2]) - X = [[1, 0, 1], [0, 1, 1]] - X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (2, 3 + 2 + 2)) - assert_array_equal(enc.n_values_, [3, 2, 2]) - # check that testing with larger feature works: - X = np.array([[2, 0, 1], [0, 1, 1]]) - enc.transform(X) - - # test that an error is raise when out of bounds: - X_too_large = [[0, 2, 1], [0, 1, 1]] - assert_raises(ValueError, enc.transform, X_too_large) - - # test that error is raised when wrong number of features - assert_raises(ValueError, enc.transform, X[:, :-1]) - # test that error is raised when wrong number of features in fit - # with prespecified n_values - assert_raises(ValueError, enc.fit, X[:, :-1]) - # test exception on wrong init param - assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X) - - enc = OneHotEncoder() - # test negative input to fit - assert_raises(ValueError, enc.fit, [[0], [-1]]) - - # test negative input to transform - enc.fit([[0], [1]]) - assert_raises(ValueError, enc.transform, [[0], [-1]]) - - -def _check_transform_selected(X, X_expected, sel): - for M in (X, sp.csr_matrix(X)): - Xtr = _transform_selected(M, Binarizer().transform, sel) - assert_array_equal(toarray(Xtr), X_expected) - - -def test_transform_selected(): - X = [[3, 2, 1], [0, 1, 1]] - - X_expected = [[1, 2, 1], [0, 1, 1]] - _check_transform_selected(X, X_expected, [0]) - _check_transform_selected(X, X_expected, [True, False, False]) - - X_expected = [[1, 1, 1], [0, 1, 1]] - _check_transform_selected(X, X_expected, [0, 1, 2]) - _check_transform_selected(X, X_expected, [True, True, True]) - _check_transform_selected(X, X_expected, "all") - - _check_transform_selected(X, X, []) - _check_transform_selected(X, X, [False, False, False]) - - -def _run_one_hot(X, X2, cat): - enc = OneHotEncoder(categorical_features=cat) - Xtr = enc.fit_transform(X) - X2tr = enc.transform(X2) - return Xtr, X2tr - - -def _check_one_hot(X, X2, cat, n_features): - ind = np.where(cat)[0] - # With mask - A, B = _run_one_hot(X, X2, cat) - # With indices - C, D = _run_one_hot(X, X2, ind) - # Check shape - assert_equal(A.shape, (2, n_features)) - assert_equal(B.shape, (1, n_features)) - assert_equal(C.shape, (2, n_features)) - assert_equal(D.shape, (1, n_features)) - # Check that mask and indices give the same results - assert_array_equal(toarray(A), toarray(C)) - assert_array_equal(toarray(B), toarray(D)) - - -def test_one_hot_encoder_categorical_features(): - X = np.array([[3, 2, 1], [0, 1, 1]]) - X2 = np.array([[1, 1, 1]]) - - cat = [True, False, False] - _check_one_hot(X, X2, cat, 4) - - # Edge case: all non-categorical - cat = [False, False, False] - _check_one_hot(X, X2, cat, 3) - - # Edge case: all categorical - cat = [True, True, True] - _check_one_hot(X, X2, cat, 5) - - -def test_label_encoder(): - """Test LabelEncoder's transform and inverse_transform methods""" - le = LabelEncoder() - 
le.fit([1, 1, 4, 5, -1, 0]) - assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) - assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), - [1, 2, 3, 3, 4, 0, 0]) - assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), - [0, 1, 4, 4, 5, -1, -1]) - assert_raises(ValueError, le.transform, [0, 6]) - - -def test_label_encoder_fit_transform(): - """Test fit_transform""" - le = LabelEncoder() - ret = le.fit_transform([1, 1, 4, 5, -1, 0]) - assert_array_equal(ret, [2, 2, 3, 4, 0, 1]) - - le = LabelEncoder() - ret = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"]) - assert_array_equal(ret, [1, 1, 2, 0]) - - -def test_label_encoder_string_labels(): - """Test LabelEncoder's transform and inverse_transform methods with - non-numeric labels""" - le = LabelEncoder() - le.fit(["paris", "paris", "tokyo", "amsterdam"]) - assert_array_equal(le.classes_, ["amsterdam", "paris", "tokyo"]) - assert_array_equal(le.transform(["tokyo", "tokyo", "paris"]), - [2, 2, 1]) - assert_array_equal(le.inverse_transform([2, 2, 1]), - ["tokyo", "tokyo", "paris"]) - assert_raises(ValueError, le.transform, ["london"]) - - -def test_label_encoder_errors(): - """Check that invalid arguments yield ValueError""" - le = LabelEncoder() - assert_raises(ValueError, le.transform, []) - assert_raises(ValueError, le.inverse_transform, []) - - -def test_label_binarizer_iris(): - lb = LabelBinarizer() - Y = lb.fit_transform(iris.target) - clfs = [SGDClassifier().fit(iris.data, Y[:, k]) - for k in range(len(lb.classes_))] - Y_pred = np.array([clf.decision_function(iris.data) for clf in clfs]).T - y_pred = lb.inverse_transform(Y_pred) - accuracy = np.mean(iris.target == y_pred) - y_pred2 = SGDClassifier().fit(iris.data, iris.target).predict(iris.data) - accuracy2 = np.mean(iris.target == y_pred2) - assert_almost_equal(accuracy, accuracy2) - - -def test_label_binarizer_multilabel_unlabeled(): - """Check that LabelBinarizer can handle an unlabeled sample""" - lb = LabelBinarizer() - y = [[1, 2], [1], []] - Y = np.array([[1, 1], - [1, 0], - [0, 0]]) - assert_array_equal(lb.fit_transform(y), Y) - - -def test_center_kernel(): - """Test that KernelCenterer is equivalent to StandardScaler - in feature space""" - rng = np.random.RandomState(0) - X_fit = rng.random_sample((5, 4)) - scaler = StandardScaler(with_std=False) - scaler.fit(X_fit) - X_fit_centered = scaler.transform(X_fit) - K_fit = np.dot(X_fit, X_fit.T) - - # center fit time matrix - centerer = KernelCenterer() - K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T) - K_fit_centered2 = centerer.fit_transform(K_fit) - assert_array_almost_equal(K_fit_centered, K_fit_centered2) - - # center predict time matrix - X_pred = rng.random_sample((2, 4)) - K_pred = np.dot(X_pred, X_fit.T) - X_pred_centered = scaler.transform(X_pred) - K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T) - K_pred_centered2 = centerer.transform(K_pred) - assert_array_almost_equal(K_pred_centered, K_pred_centered2) - - -def test_fit_transform(): - rng = np.random.RandomState(0) - X = rng.random_sample((5, 4)) - for obj in ((StandardScaler(), Normalizer(), Binarizer())): - X_transformed = obj.fit(X).transform(X) - X_transformed2 = obj.fit_transform(X) - assert_array_equal(X_transformed, X_transformed2) - - -def test_add_dummy_feature(): - X = [[1, 0], [0, 1], [0, 1]] - X = add_dummy_feature(X) - assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) - - -def test_add_dummy_feature_coo(): - X = sp.coo_matrix([[1, 0], [0, 1], [0, 1]]) - X = add_dummy_feature(X) - 
assert_true(sp.isspmatrix_coo(X), X) - assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) - - -def test_add_dummy_feature_csc(): - X = sp.csc_matrix([[1, 0], [0, 1], [0, 1]]) - X = add_dummy_feature(X) - assert_true(sp.isspmatrix_csc(X), X) - assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) - - -def test_add_dummy_feature_csr(): - X = sp.csr_matrix([[1, 0], [0, 1], [0, 1]]) - X = add_dummy_feature(X) - assert_true(sp.isspmatrix_csr(X), X) - assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) From b78b6898a2d7d2c99e77f257fd2fc3881116c30d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Feb 2017 17:17:59 +0100 Subject: [PATCH 003/106] first draft --- sklearn/preprocessing/__init__.py | 2 + sklearn/preprocessing/data.py | 102 ++++++++++++++---------------- 2 files changed, 49 insertions(+), 55 deletions(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index cabbd469c10d4..43b8abf3ab03c 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -20,6 +20,7 @@ from .data import maxabs_scale from .data import minmax_scale from .data import OneHotEncoder +from .data import QuantileNormalizer from .data import PolynomialFeatures @@ -41,6 +42,7 @@ 'MultiLabelBinarizer', 'MinMaxScaler', 'MaxAbsScaler', + 'QuantileNormalizer', 'Normalizer', 'OneHotEncoder', 'RobustScaler', diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 519f1233e0c18..783b71a484818 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -13,6 +13,7 @@ import numpy as np from scipy import sparse +from scipy.interpolate import interp1d from ..base import BaseEstimator, TransformerMixin from ..externals import six @@ -25,7 +26,8 @@ from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis, incr_mean_variance_axis, min_max_axis) -from ..utils.validation import check_is_fitted, FLOAT_DTYPES +from ..utils.validation import (check_is_fitted, check_random_state, + FLOAT_DTYPES) zip = six.moves.zip @@ -41,6 +43,7 @@ 'OneHotEncoder', 'RobustScaler', 'StandardScaler', + 'QuantileNormalizer', 'add_dummy_feature', 'binarize', 'normalize', @@ -1903,7 +1906,7 @@ def transform(self, X): self.categorical_features, copy=True) -class RankScaler(BaseEstimator, TransformerMixin): +class QuantileNormalizer(BaseEstimator, TransformerMixin): """Rank-standardize features to a percentile, in the range [0, 1]. Rank-scaling happens independently on each feature, by determining @@ -1945,74 +1948,63 @@ class RankScaler(BaseEstimator, TransformerMixin): that is faster, but less robust to outliers. """ - def __init__(self, n_ranks=1000): - # TODO: Add min and max parameters? Default = [0, 1] - self.n_ranks = n_ranks + def __init__(self, n_quantiles=1000, subsample=int(1e5), + random_state=None): + self.n_quantiles = n_quantiles + self.subsample = subsample + self.random_state = random_state def fit(self, X, y=None): - """Compute the feature ranks for later scaling. - - fit will take time O(n_features * n_samples * log(n_samples)), - because it must sort the entire matrix. - - It use memory O(n_features * n_ranks). + """Compute the quantiles used for normalizing. Parameters ---------- X : array-like, shape (n_samples, n_features) - The data used to compute feature ranks. + The data used to compute the quantiles. 
""" - X = array2d(X) - n_samples, n_features = X.shape - full_sort_X_ = np.sort(X, axis=0) - if not self.n_ranks or self.n_ranks >= n_samples: - # Store the full matrix - self.sort_X_ = full_sort_X_ + X = check_array(X) + rng = check_random_state(self.random_state) + + # subsample the matrix X if necessary + if self.subsample < X.shape[0]: + subsample_idx = rng.choice(X.shape[0], self.subsample, + replace=False) else: - # Approximate the stored sort_X_ - self.sort_X_ = np.zeros((self.n_ranks, n_features)) - for i in range(self.n_ranks): - for j in range(n_features): - # Find the corresponding i in the original ranking - iorig = i * 1. * n_samples / self.n_ranks - ioriglo = int(iorig) - iorighi = ioriglo + 1 - - if ioriglo == n_samples: - self.sort_X_[i, j] = full_sort_X_[ioriglo, j] - else: - # And use linear interpolation to combine the - # original values. - wlo = (1 - (iorig - ioriglo)) - whi = (1 - (iorighi - iorig)) - assert wlo >= 0 and wlo <= 1 - assert whi >= 0 and whi <= 1 - assert_almost_equal(wlo+whi, 1.) - self.sort_X_[i, j] = wlo * full_sort_X_[ioriglo, j] \ - + whi * full_sort_X_[iorighi, j] + subsample_idx = np.range(X.shape[0]) + + self.references_ = np.linspace(0., 100., self.n_quantiles, + endpoint=True) + self.quantiles_ = np.percentile(X[subsample_idx], self.references_, + axis=0) + return self def transform(self, X): - """Perform rank-standardization. + """Feature-wise normalization of the data. - transform will take O(n_features * n_samples * log(n_ranks)), - where `n_fit_samples` is the number of samples used during `fit`. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The data used to scale along the features axis. + """ + X = check_array(X) + + Xt = X.copy() + for feat_idx, quantiles_feat in enumerate(self.quantiles_.T): + mapping_func = interp1d(self.references_, self.quantiles_feat) + Xt[:, feat_idx] = mapping_func(Xt[:, feat_idx]) + + def inverse_transform(self, X): + """Back-projection to the original space. Parameters ---------- X : array-like, shape (n_samples, n_features) The data used to scale along the features axis. """ - X = array2d(X) - warn_if_not_float(X, estimator=self) - # TODO: Can add a copy parameter, and simply overwrite X if copy=False - X2 = np.zeros(X.shape) - for j in range(X.shape[1]): - lidx = np.searchsorted(self.sort_X_[:, j], X[:, j], side='left') - ridx = np.searchsorted(self.sort_X_[:, j], X[:, j], side='right') - v = 1. * (lidx + ridx) / (2 * self.sort_X_.shape[0]) - X2[:,j] = v - return X2 - - # TODO : Add inverse_transform method. - # I believe we could reuse the approximation code in `fit`. + X = check_array(X) + + Xt = X.copy() + for feat_idx, quantiles_feat in enumerate(self.quantiles_.T): + mapping_func = interp1d(self.references_, self.quantiles_feat) + Xt[:, feat_idx] = mapping_func(Xt[:, feat_idx]) From bb1829a381bc2ed13c71e96d02513f59bb815dcb Mon Sep 17 00:00:00 2001 From: Thierry Guillemot Date: Wed, 15 Feb 2017 17:18:33 +0100 Subject: [PATCH 004/106] Add tests. 
--- sklearn/preprocessing/tests/test_data.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 3df17f68dcceb..f28734cff5ebd 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -887,6 +887,17 @@ def test_robust_scaler_iris_quantiles(): assert_array_almost_equal(q_range, 1) +def test_robust_scaler_iris(): + X = iris.data + normalizer = QuantileNormalizer() + X_trans = normalizer.fit_transform(X) + assert_array_almost_equal(np.min(X_trans, axis=0), 0) + assert_array_almost_equal(np.max(X_trans, axis=0), 1) + + X_trans_inv = normalizer.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), From 5c9bcbc5b543e34adf2ebe861bea80de62a8e863 Mon Sep 17 00:00:00 2001 From: Thierry Guillemot Date: Wed, 15 Feb 2017 17:44:07 +0100 Subject: [PATCH 005/106] Fix bug in QuantileNormalizer. --- sklearn/preprocessing/data.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 783b71a484818..16d5c50624dbd 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1970,13 +1970,15 @@ def fit(self, X, y=None): subsample_idx = rng.choice(X.shape[0], self.subsample, replace=False) else: - subsample_idx = np.range(X.shape[0]) + subsample_idx = range(X.shape[0]) self.references_ = np.linspace(0., 100., self.n_quantiles, endpoint=True) self.quantiles_ = np.percentile(X[subsample_idx], self.references_, axis=0) + self.references_ /= 100. + return self def transform(self, X): @@ -1991,9 +1993,11 @@ def transform(self, X): Xt = X.copy() for feat_idx, quantiles_feat in enumerate(self.quantiles_.T): - mapping_func = interp1d(self.references_, self.quantiles_feat) + mapping_func = interp1d(quantiles_feat, self.references_) Xt[:, feat_idx] = mapping_func(Xt[:, feat_idx]) + return Xt + def inverse_transform(self, X): """Back-projection to the original space. @@ -2006,5 +2010,7 @@ def inverse_transform(self, X): Xt = X.copy() for feat_idx, quantiles_feat in enumerate(self.quantiles_.T): - mapping_func = interp1d(self.references_, self.quantiles_feat) + mapping_func = interp1d(self.references_, quantiles_feat) Xt[:, feat_idx] = mapping_func(Xt[:, feat_idx]) + + return Xt \ No newline at end of file From 8d4b9cc0a69cab500aae8deb0ee3c0ba9c4dab6b Mon Sep 17 00:00:00 2001 From: Thierry Guillemot Date: Wed, 15 Feb 2017 17:57:50 +0100 Subject: [PATCH 006/106] Add quantile_normalizer. 
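The helper added below wraps the estimator so that axis=1 normalizes rows by fitting on the transposed matrix. A usage sketch (illustrative only; the QuantileNormalizer name exists on this branch, and released scikit-learn later exposes the functionality as QuantileTransformer):

import numpy as np
from sklearn.preprocessing import QuantileNormalizer  # available on this branch only

X = np.array([[1., 2., 3.],
              [4., 5., 6.]])

qn = QuantileNormalizer(n_quantiles=3)
by_column = qn.fit_transform(X)       # equivalent to quantile_normalize(X, axis=0)
by_row = qn.fit_transform(X.T).T      # equivalent to quantile_normalize(X, axis=1)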
--- sklearn/preprocessing/__init__.py | 3 ++- sklearn/preprocessing/data.py | 12 +++++++++++- sklearn/preprocessing/tests/test_data.py | 8 +++++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 43b8abf3ab03c..6ef4dc0c2ba62 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -12,6 +12,7 @@ from .data import Normalizer from .data import RobustScaler from .data import StandardScaler +from .data import QuantileNormalizer from .data import add_dummy_feature from .data import binarize from .data import normalize @@ -19,8 +20,8 @@ from .data import robust_scale from .data import maxabs_scale from .data import minmax_scale +from .data import quantile_normalize from .data import OneHotEncoder -from .data import QuantileNormalizer from .data import PolynomialFeatures diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 16d5c50624dbd..56a3cb700e1e4 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2013,4 +2013,14 @@ def inverse_transform(self, X): mapping_func = interp1d(self.references_, quantiles_feat) Xt[:, feat_idx] = mapping_func(Xt[:, feat_idx]) - return Xt \ No newline at end of file + return Xt + + +def quantile_normalize(X, axis=0, n_quantiles=1000, subsample=int(1e5), + random_state=None): + n = QuantileNormalizer(n_quantiles=n_quantiles, subsample=subsample, + random_state=random_state) + if axis == 0: + return n.fit_transform(X) + else: + return n.fit_transform(X.T).T diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index f28734cff5ebd..e1574054e745f 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -41,6 +41,7 @@ from sklearn.preprocessing.data import scale from sklearn.preprocessing.data import MinMaxScaler from sklearn.preprocessing.data import minmax_scale +from sklearn.preprocessing.data import QuantileNormalizer from sklearn.preprocessing.data import MaxAbsScaler from sklearn.preprocessing.data import maxabs_scale from sklearn.preprocessing.data import RobustScaler @@ -887,12 +888,13 @@ def test_robust_scaler_iris_quantiles(): assert_array_almost_equal(q_range, 1) -def test_robust_scaler_iris(): +def test_quantile_normalizer_iris(): X = iris.data normalizer = QuantileNormalizer() + X_trans = normalizer.fit_transform(X) - assert_array_almost_equal(np.min(X_trans, axis=0), 0) - assert_array_almost_equal(np.max(X_trans, axis=0), 1) + assert_array_almost_equal(np.min(X_trans, axis=0), 0.) + assert_array_almost_equal(np.max(X_trans, axis=0), 1.) 
X_trans_inv = normalizer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) From 45e48f7d92d660ee801beb17fca97ecabde8c791 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Feb 2017 18:54:14 +0100 Subject: [PATCH 007/106] Implement pickling --- sklearn/preprocessing/data.py | 50 ++++++++++++++++++++---- sklearn/preprocessing/tests/test_data.py | 13 +++++- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 56a3cb700e1e4..9f6dfa0190b32 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1954,6 +1954,25 @@ def __init__(self, n_quantiles=1000, subsample=int(1e5), self.subsample = subsample self.random_state = random_state + def _build_f(self): + """Build the transform functions.""" + check_is_fitted(self, 'quantiles_') + + self.f_transform_ = [interp1d(quantiles_feat, self.references_, + bounds_error=False, + fill_value=(min(self.references_), + max(self.references_))) + for feat_idx, quantiles_feat in enumerate( + self.quantiles_.T)] + + self.f_inverse_transform_ = [ + interp1d(self.references_, quantiles_feat, + bounds_error=False, + fill_value=(min(quantiles_feat), + max(quantiles_feat))) + for feat_idx, quantiles_feat in enumerate( + self.quantiles_.T)] + def fit(self, X, y=None): """Compute the quantiles used for normalizing. @@ -1976,8 +1995,8 @@ def fit(self, X, y=None): endpoint=True) self.quantiles_ = np.percentile(X[subsample_idx], self.references_, axis=0) - self.references_ /= 100. + self._build_f() return self @@ -1990,11 +2009,11 @@ def transform(self, X): The data used to scale along the features axis. """ X = check_array(X) + check_is_fitted(self, 'f_transform_') Xt = X.copy() - for feat_idx, quantiles_feat in enumerate(self.quantiles_.T): - mapping_func = interp1d(quantiles_feat, self.references_) - Xt[:, feat_idx] = mapping_func(Xt[:, feat_idx]) + for feat_idx, f in enumerate(self.f_transform_): + Xt[:, feat_idx] = f(Xt[:, feat_idx]) return Xt @@ -2007,14 +2026,29 @@ def inverse_transform(self, X): The data used to scale along the features axis. """ X = check_array(X) - + check_is_fitted(self, 'f_inverse_transform_') Xt = X.copy() - for feat_idx, quantiles_feat in enumerate(self.quantiles_.T): - mapping_func = interp1d(self.references_, quantiles_feat) - Xt[:, feat_idx] = mapping_func(Xt[:, feat_idx]) + for feat_idx, f in enumerate(self.f_inverse_transform_): + Xt[:, feat_idx] = f(Xt[:, feat_idx]) return Xt + def __getstate__(self): + """Pickle-protocol - return state of the estimator. """ + state = super(QuantileNormalizer, self).__getstate__() + # remove interpolation method + state.pop('f_transform_', None) + state.pop('f_inverse_transform_', None) + return state + + def __setstate__(self, state): + """Pickle-protocol - set state of the estimator. + We need to rebuild the interpolation function. 
+ """ + super(QuantileNormalizer, self).__setstate__(state) + if hasattr(self, 'references_') and hasattr(self, 'quantiles_'): + self._build_f() + def quantile_normalize(X, axis=0, n_quantiles=1000, subsample=int(1e5), random_state=None): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index e1574054e745f..f1f6ffb53a276 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -5,6 +5,7 @@ # License: BSD 3 clause import warnings +import pickle import numpy as np import numpy.linalg as la from scipy import sparse @@ -48,7 +49,6 @@ from sklearn.preprocessing.data import robust_scale from sklearn.preprocessing.data import add_dummy_feature from sklearn.preprocessing.data import PolynomialFeatures -from sklearn.preprocessing.data import RankScaler from sklearn.exceptions import DataConversionWarning from sklearn.pipeline import Pipeline @@ -1690,3 +1690,14 @@ def test_fit_cold_start(): # with a different shape, this may break the scaler unless the internal # state is reset scaler.fit_transform(X_2d) + + +def test_quantile_normalizer_pickling(): + iris = datasets.load_iris() + qn = QuantileNormalizer() + qn.fit(iris.data) + + qn_ser = pickle.dumps(qn, pickle.HIGHEST_PROTOCOL) + qn2 = pickle.loads(qn_ser) + assert_array_almost_equal(qn.transform(iris.data), + qn2.transform(iris.data)) From 0a646c1dd858a28aad3f397e03b2416a9aee9983 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Feb 2017 19:41:32 +0100 Subject: [PATCH 008/106] create a specific function for dense transform --- sklearn/preprocessing/data.py | 57 +++++++++++++++++++----- sklearn/preprocessing/tests/test_data.py | 36 --------------- 2 files changed, 46 insertions(+), 47 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 9f6dfa0190b32..9be5b45da4a3c 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2000,23 +2000,55 @@ def fit(self, X, y=None): return self - def transform(self, X): - """Feature-wise normalization of the data. + def _dense_transform(self, X, direction=True): + """Forward and inverse transform for dense matrices Parameters ---------- X : array-like, shape (n_samples, n_features) The data used to scale along the features axis. - """ - X = check_array(X) - check_is_fitted(self, 'f_transform_') + direction : bool, optional (default=True) + If True, apply forward transform. If False, apply + inverse transform. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + Projected data. + """ Xt = X.copy() - for feat_idx, f in enumerate(self.f_transform_): + if direction: + func_transform = self.f_transform_ + else: + func_transform = self.f_inverse_transform_ + + for feat_idx, f in enumerate(func_transform): Xt[:, feat_idx] = f(Xt[:, feat_idx]) + print(Xt.shape) + return Xt + def transform(self, X): + """Feature-wise normalization of the data. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The data used to scale along the features axis. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + Projected data. + """ + X = check_array(X) + check_is_fitted(self, 'f_transform_') + # FIXME: remove not and put sparse first + if not sparse.issparse(X): + return self._dense_transform(X, True) + def inverse_transform(self, X): """Back-projection to the original space. 
@@ -2024,14 +2056,17 @@ def inverse_transform(self, X): ---------- X : array-like, shape (n_samples, n_features) The data used to scale along the features axis. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + Projected data. """ X = check_array(X) check_is_fitted(self, 'f_inverse_transform_') - Xt = X.copy() - for feat_idx, f in enumerate(self.f_inverse_transform_): - Xt[:, feat_idx] = f(Xt[:, feat_idx]) - - return Xt + # FIXME: remove not and put sparse first + if not sparse.issparse(X): + return self._dense_transform(X, False) def __getstate__(self): """Pickle-protocol - return state of the estimator. """ diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index f1f6ffb53a276..d3abf56397c5e 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -291,42 +291,6 @@ def test_scaler_2d_arrays(): # Check that X has not been copied assert_true(X_scaled is not X) - X = np.array([[1, 0, 0, 0, 1], - [2, 1, 4, 1, 1], - [3, 2, 3, 1, 0], - [3, 0, 0, 4, 1]]) - - rank_scaler = RankScaler() - rank_scaler.fit(X) - X_scaled = rank_scaler.transform(X) - assert_array_almost_equal(X_scaled, [[0.125, 0.25, 0.25, 0.125, 0.625], - [0.375, 0.625, 0.875, 0.5, 0.625], - [0.75, 0.875, 0.625, 0.5, 0.125], - [0.75, 0.25, 0.25, 0.875, 0.625]]) - - X2 = np.array([[0, 1.5, 0, 5, 10]]) - X2_scaled = rank_scaler.transform(X2) - assert_array_almost_equal(X2_scaled, [[0., 0.75, 0.25, 1., 1.]]) - - # Check RankScaler at different n_ranks - n_features = 100 - for n_samples in [10, 100, 1000]: - for n_ranks in [n_samples + 1, n_samples, n_samples - 1, - int(n_samples / 2), int(n_samples / 7), int(n_samples / 10)]: - X = rng.randn(n_samples, n_features) - rank_scaler1 = RankScaler(n_ranks=None) - rank_scaler2 = RankScaler(n_ranks=n_ranks) - rank_scaler1.fit(X) - rank_scaler2.fit(X) - - X2 = rng.randn(1000, n_features) - X21 = rank_scaler1.transform(X2) - X22 = rank_scaler2.transform(X2) - - # In the approximate version X22, all values must - # be within 1./n_ranks of the exact value X11. - assert_true(np.all(np.fabs(X21 - X22) < 1. / n_ranks)) - def test_handle_zeros_in_scale(): s1 = np.array([0, 1, 2, 3]) From 4dbdb6e368d3d365f8db47071e533efc4ca1569a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Feb 2017 19:48:03 +0100 Subject: [PATCH 009/106] Create a fit function for the dense case --- sklearn/preprocessing/data.py | 36 +++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 9be5b45da4a3c..dbcd67212e65e 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -6,6 +6,8 @@ # Giorgio Patrini # License: BSD 3 clause +from __future__ import division + from itertools import chain, combinations import numbers import warnings @@ -1973,15 +1975,19 @@ def _build_f(self): for feat_idx, quantiles_feat in enumerate( self.quantiles_.T)] - def fit(self, X, y=None): - """Compute the quantiles used for normalizing. + def _dense_fit(self, X): + """Compute percentiles for dense matrices. Parameters ---------- X : array-like, shape (n_samples, n_features) - The data used to compute the quantiles. + The data used to scale along the features axis. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + Projected data. 
""" - X = check_array(X) rng = check_random_state(self.random_state) # subsample the matrix X if necessary @@ -1995,13 +2001,29 @@ def fit(self, X, y=None): endpoint=True) self.quantiles_ = np.percentile(X[subsample_idx], self.references_, axis=0) - self.references_ /= 100. + # normalize the value between 0 and 1 + self.references_ /= 100 + + def fit(self, X, y=None): + """Compute the quantiles used for normalizing. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The data used to compute the quantiles. + """ + X = check_array(X) + + # FIXME: remove not and put sparse first + if not sparse.issparse(X): + self._dense_fit(X) + self._build_f() return self def _dense_transform(self, X, direction=True): - """Forward and inverse transform for dense matrices + """Forward and inverse transform for dense matrices. Parameters ---------- @@ -2026,8 +2048,6 @@ def _dense_transform(self, X, direction=True): for feat_idx, f in enumerate(func_transform): Xt[:, feat_idx] = f(Xt[:, feat_idx]) - print(Xt.shape) - return Xt def transform(self, X): From f723edb023bb0695fe13f146c500ffda7edefd30 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Feb 2017 21:58:04 +0100 Subject: [PATCH 010/106] Create a toy examples --- sklearn/preprocessing/data.py | 29 +++++++++++------------- sklearn/preprocessing/tests/test_data.py | 18 ++++++++++++++- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index dbcd67212e65e..10e7f24e5746c 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1960,20 +1960,19 @@ def _build_f(self): """Build the transform functions.""" check_is_fitted(self, 'quantiles_') - self.f_transform_ = [interp1d(quantiles_feat, self.references_, - bounds_error=False, - fill_value=(min(self.references_), - max(self.references_))) - for feat_idx, quantiles_feat in enumerate( - self.quantiles_.T)] + self.f_transform_ = [ + interp1d(quantiles_feat, self.references_, + bounds_error=False, + fill_value=(min(self.references_), + max(self.references_))) + for quantiles_feat in self.quantiles_.T] self.f_inverse_transform_ = [ interp1d(self.references_, quantiles_feat, bounds_error=False, fill_value=(min(quantiles_feat), max(quantiles_feat))) - for feat_idx, quantiles_feat in enumerate( - self.quantiles_.T)] + for quantiles_feat in self.quantiles_.T] def _dense_fit(self, X): """Compute percentiles for dense matrices. @@ -1997,12 +1996,10 @@ def _dense_fit(self, X): else: subsample_idx = range(X.shape[0]) - self.references_ = np.linspace(0., 100., self.n_quantiles, + self.references_ = np.linspace(0, 1, self.n_quantiles, endpoint=True) - self.quantiles_ = np.percentile(X[subsample_idx], self.references_, - axis=0) - # normalize the value between 0 and 1 - self.references_ /= 100 + self.quantiles_ = np.percentile(X[subsample_idx, :], + self.references_ * 100, axis=0) def fit(self, X, y=None): """Compute the quantiles used for normalizing. @@ -2012,11 +2009,11 @@ def fit(self, X, y=None): X : array-like, shape (n_samples, n_features) The data used to compute the quantiles. 
""" - X = check_array(X) + X_ = check_array(X) # FIXME: remove not and put sparse first - if not sparse.issparse(X): - self._dense_fit(X) + if not sparse.issparse(X_): + self._dense_fit(X_) self._build_f() diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index d3abf56397c5e..2284a8c1b5f23 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -856,6 +856,23 @@ def test_quantile_normalizer_iris(): X = iris.data normalizer = QuantileNormalizer() + X_trans = normalizer.fit_transform(X) + # FIXME: one of those will drive to precision error + # in the interpolation + assert_array_almost_equal(np.min(X_trans, axis=0), 0.) + assert_array_almost_equal(np.max(X_trans, axis=0), 1.) + + X_trans_inv = normalizer.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + +def test_quantile_normalizer_dense_toy(): + + X = np.array([[0, 25, 50, 75, 100], + [2, 4, 6, 8, 10], + [2.6, 4.1, 2.3, 9.5, 0.1]]).T + + normalizer = QuantileNormalizer() X_trans = normalizer.fit_transform(X) assert_array_almost_equal(np.min(X_trans, axis=0), 0.) assert_array_almost_equal(np.max(X_trans, axis=0), 1.) @@ -1657,7 +1674,6 @@ def test_fit_cold_start(): def test_quantile_normalizer_pickling(): - iris = datasets.load_iris() qn = QuantileNormalizer() qn.fit(iris.data) From 5c8d4960aed90f26b1c553f839049fb0610ab65b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 02:04:16 +0100 Subject: [PATCH 011/106] First draft with sparse matrices --- sklearn/preprocessing/data.py | 193 ++++++++++++++++------- sklearn/preprocessing/tests/test_data.py | 33 +++- 2 files changed, 168 insertions(+), 58 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 10e7f24e5746c..50cffa617d3fe 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1909,40 +1909,47 @@ def transform(self, X): class QuantileNormalizer(BaseEstimator, TransformerMixin): - """Rank-standardize features to a percentile, in the range [0, 1]. + """Normalize features using quantiles information. - Rank-scaling happens independently on each feature, by determining - the percentile of the feature value. - A feature value that is smaller than observed during fitting - will scale to 0. - A feature value that is larger than observed during fitting - will scale to 1. - A feature value that is the median will scale to 0.5. + This Normalizer scales the features between 0 and 1, equalizing the + distribution of each feature to a uniform distribution. Therefore, + for a given feature, this normalization tends to spread out the most + frequent values. - Standardization of a dataset is a common requirement for many - machine learning estimators. Rank-scaling is useful when - estimators perform badly on StandardScalar features. Rank-scaling - is more robust than StandardScaler, because outliers can't have - large values post scaling. It is an empirical question whether - you want outliers to be given high importance (StandardScaler) - or not (RankScaler). + The normalization is applied on each feature independently. + The cumulative density function of a feature is used to project the + original values. Parameters ---------- - n_ranks : int, 1000 by default - The number of different ranks possible. - i.e. The number of indices in the compressed ranking matrix - `sort_X_`. - This is an approximation, to save memory and transform - computation time. - e.g. 
if 1000, transformed values will have resolution 0.001. - If `None`, we store the full size matrix, comparable - in size to the initial fit `X`. + n_quantiles : int, optional (default=1000) + Number of quantiles to be computed. It corresponds to the number + of landmarks used to discretize the cumulative density function. + + subsample : int, optional (default=1e5) + Maximum number of samples used to estimate the quantiles. + + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by np.random. + Attributes ---------- - `sort_X_` : array of ints, shape (n_samples, n_features) - The rank-index of every feature in the fit X. + references_ : ndarray, shape (n_quantiles,) + The quantiles of reference. + + quantiles_ : ndarray, shape (n_quantiles, n_features) + The values corresponding the quantiles of reference. + + f_transform_ : list of callable, shape (n_quantiles,) + The cumulative density function used to project the data. + + f_inverse_transform_ : list of callable, shape (n_quantiles,) + The inverse of the cumulative density function used to project the + data. See also -------- @@ -1956,6 +1963,15 @@ def __init__(self, n_quantiles=1000, subsample=int(1e5), self.subsample = subsample self.random_state = random_state + def _validate_X(self, X): + """Private function to validate X.""" + X = check_array(X, accept_sparse='csc') + # we only accept positive sparse matrix + if sparse.issparse(X) and X.min() < 0: + raise ValueError('QuantileNormalizer only accepts semi-positive' + ' sparse matrices') + return X + def _build_f(self): """Build the transform functions.""" check_is_fitted(self, 'quantiles_') @@ -1979,13 +1995,8 @@ def _dense_fit(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : ndarray, shape (n_samples, n_features) The data used to scale along the features axis. - - Returns - ------- - Xt : array-like, shape (n_samples, n_features) - Projected data. """ rng = check_random_state(self.random_state) @@ -2001,19 +2012,53 @@ def _dense_fit(self, X): self.quantiles_ = np.percentile(X[subsample_idx, :], self.references_ * 100, axis=0) + def _sparse_fit(self, X): + """Compute percentiles for sparse matrices. + + Parameters + ---------- + X : sparse matrix, shape (n_samples, n_features) + The data used to scale along the features axis. The sparse matrix + needs to be semi-positive. + """ + rng = check_random_state(self.random_state) + + n_samples, n_feat = X.get_shape() + if self.subsample < n_samples: + subsample_idx = rng.choice(n_samples, self.subsample, + replace=False) + X_csr = X.tocsr()[subsample_idx] + X = X_csr.tocsc() + + self.references_ = np.linspace(0, 1, self.n_quantiles, + endpoint=True) + # FIXME: it does not take into account the zero in the computation + self.quantiles_ = np.array([np.percentile( + X.data[X.indptr[feat]:X.indptr[feat + 1]], self.references_ * 100) + for feat in range(n_feat)]).T + def fit(self, X, y=None): """Compute the quantiles used for normalizing. Parameters ---------- - X : array-like, shape (n_samples, n_features) - The data used to compute the quantiles. + X : ndarray or sparse matrix, shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. 
Additionally, the sparse matrix needs to be + semi-positive. + + Returns + ------- + self : object + Returns self """ - X_ = check_array(X) + X = self._validate_X(X) - # FIXME: remove not and put sparse first - if not sparse.issparse(X_): - self._dense_fit(X_) + if sparse.issparse(X): + self._sparse_fit(X) + else: + self._dense_fit(X) self._build_f() @@ -2024,7 +2069,7 @@ def _dense_transform(self, X, direction=True): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : ndarray, shape (n_samples, n_features) The data used to scale along the features axis. direction : bool, optional (default=True) @@ -2033,7 +2078,7 @@ def _dense_transform(self, X, direction=True): Returns ------- - Xt : array-like, shape (n_samples, n_features) + Xt : ndarray, shape (n_samples, n_features) Projected data. """ Xt = X.copy() @@ -2047,42 +2092,78 @@ def _dense_transform(self, X, direction=True): return Xt + def _sparse_transform(self, X, direction=True): + """Forward and inverse transform for sparse matrices. + + Parameters + ---------- + X : sparse matrix, shape (n_samples, n_features) + The data used to scale along the features axis. The sparse matrix + needs to be semi-positive. + + direction : bool, optional (default=True) + If True, apply forward transform. If False, apply + inverse transform. + + Returns + ------- + Xt : sparse matrix, shape (n_samples, n_features) + Projected data. + """ + Xt = X.copy() + if direction: + func_transform = self.f_transform_ + else: + func_transform = self.f_inverse_transform_ + + for feat_idx, f in enumerate(func_transform): + Xt.data[Xt.indptr[feat_idx]:Xt.indptr[feat_idx + 1]] = f( + Xt.data[Xt.indptr[feat_idx]:Xt.indptr[feat_idx + 1]]) + + return Xt + def transform(self, X): """Feature-wise normalization of the data. Parameters ---------- - X : array-like, shape (n_samples, n_features) - The data used to scale along the features axis. + X : ndarray or sparse matrix, shape (n_samples, n_features) + The data to be normalized along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + semi-positive. Returns ------- - Xt : array-like, shape (n_samples, n_features) - Projected data. + Xt : ndarray or sparse matrix, shape (n_samples, n_features) + The projected data. """ - X = check_array(X) + X = self._validate_X(X) check_is_fitted(self, 'f_transform_') - # FIXME: remove not and put sparse first - if not sparse.issparse(X): + if sparse.issparse(X): + return self._sparse_transform(X, True) + else: return self._dense_transform(X, True) def inverse_transform(self, X): """Back-projection to the original space. - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - The data used to scale along the features axis. + X : ndarray or sparse matrix, shape (n_samples, n_features) + The data to be normalized along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + semi-positive. Returns ------- - Xt : array-like, shape (n_samples, n_features) - Projected data. + Xt : ndarray or sparse matrix, shape (n_samples, n_features) + The projected data. 
""" - X = check_array(X) + X = self._validate_X(X) check_is_fitted(self, 'f_inverse_transform_') - # FIXME: remove not and put sparse first - if not sparse.issparse(X): + if sparse.issparse(X): + return self._sparse_transform(X, False) + else: return self._dense_transform(X, False) def __getstate__(self): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 2284a8c1b5f23..9cc8a9f5e1882 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -859,8 +859,8 @@ def test_quantile_normalizer_iris(): X_trans = normalizer.fit_transform(X) # FIXME: one of those will drive to precision error # in the interpolation - assert_array_almost_equal(np.min(X_trans, axis=0), 0.) - assert_array_almost_equal(np.max(X_trans, axis=0), 1.) + # assert_array_almost_equal(np.min(X_trans, axis=0), 0.) + # assert_array_almost_equal(np.max(X_trans, axis=0), 1.) X_trans_inv = normalizer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) @@ -873,6 +873,7 @@ def test_quantile_normalizer_dense_toy(): [2.6, 4.1, 2.3, 9.5, 0.1]]).T normalizer = QuantileNormalizer() + normalizer.fit(X) X_trans = normalizer.fit_transform(X) assert_array_almost_equal(np.min(X_trans, axis=0), 0.) assert_array_almost_equal(np.max(X_trans, axis=0), 1.) @@ -881,6 +882,34 @@ def test_quantile_normalizer_dense_toy(): assert_array_almost_equal(X, X_trans_inv) +def test_quantile_normalizer_sparse_toy(): + + X = np.array([[0, 25, 50, 75, 100], + [2, 4, 6, 8, 10], + [2.6, 4.1, 2.3, 9.5, 0.1]]).T + X = sparse.csc_matrix(X) + + normalizer = QuantileNormalizer() + normalizer.fit(X) + X_trans = normalizer.fit_transform(X) + assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.) + assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.) 
+ + X_trans_inv = normalizer.inverse_transform(X_trans) + assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) + + +def test_quantile_normalizer_error_neg_sparse(): + X = np.array([[0, 25, 50, 75, 100], + [-2, 4, 6, 8, 10], + [2.6, 4.1, 2.3, 9.5, 0.1]]).T + X = sparse.csc_matrix(X) + + normalizer = QuantileNormalizer() + assert_raises_regex(ValueError, "QuantileNormalizer only accepts semi-" + "positive sparse matrices", normalizer.fit, X) + + def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), From bcbf79bbb582d273be836e8974b1d9bd8e2ac62b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 11:08:57 +0100 Subject: [PATCH 012/106] remove useless functions and non-negative sparse compatibility --- sklearn/preprocessing/data.py | 37 +++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 50cffa617d3fe..6de4f580ede13 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1963,15 +1963,6 @@ def __init__(self, n_quantiles=1000, subsample=int(1e5), self.subsample = subsample self.random_state = random_state - def _validate_X(self, X): - """Private function to validate X.""" - X = check_array(X, accept_sparse='csc') - # we only accept positive sparse matrix - if sparse.issparse(X) and X.min() < 0: - raise ValueError('QuantileNormalizer only accepts semi-positive' - ' sparse matrices') - return X - def _build_f(self): """Build the transform functions.""" check_is_fitted(self, 'quantiles_') @@ -2017,7 +2008,7 @@ def _sparse_fit(self, X): Parameters ---------- - X : sparse matrix, shape (n_samples, n_features) + X : sparse matrix CSC, shape (n_samples, n_features) The data used to scale along the features axis. The sparse matrix needs to be semi-positive. """ @@ -2053,7 +2044,11 @@ def fit(self, X, y=None): self : object Returns self """ - X = self._validate_X(X) + X = check_array(X, accept_sparse='csc') + # we only accept positive sparse matrix + if sparse.issparse(X) and np.any(X.data < 0): + raise ValueError('QuantileNormalizer only accepts non-negative' + ' sparse matrices') if sparse.issparse(X): self._sparse_fit(X) @@ -2097,7 +2092,7 @@ def _sparse_transform(self, X, direction=True): Parameters ---------- - X : sparse matrix, shape (n_samples, n_features) + X : sparse matrix CSC, shape (n_samples, n_features) The data used to scale along the features axis. The sparse matrix needs to be semi-positive. @@ -2107,7 +2102,7 @@ def _sparse_transform(self, X, direction=True): Returns ------- - Xt : sparse matrix, shape (n_samples, n_features) + Xt : sparse matrix CSC, shape (n_samples, n_features) Projected data. """ Xt = X.copy() @@ -2117,8 +2112,8 @@ def _sparse_transform(self, X, direction=True): func_transform = self.f_inverse_transform_ for feat_idx, f in enumerate(func_transform): - Xt.data[Xt.indptr[feat_idx]:Xt.indptr[feat_idx + 1]] = f( - Xt.data[Xt.indptr[feat_idx]:Xt.indptr[feat_idx + 1]]) + column_slice = slice(Xt.indptr[feat_idx]:Xt.indptr[feat_idx + 1]) + Xt.data[column_slice] = f(Xt.data[column_slice]) return Xt @@ -2138,7 +2133,11 @@ def transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. 
""" - X = self._validate_X(X) + X = check_array(X, accept_sparse='csc') + # we only accept positive sparse matrix + if sparse.issparse(X) and np.any(X.data < 0): + raise ValueError('QuantileNormalizer only accepts non-negative' + ' sparse matrices') check_is_fitted(self, 'f_transform_') if sparse.issparse(X): return self._sparse_transform(X, True) @@ -2159,7 +2158,11 @@ def inverse_transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. """ - X = self._validate_X(X) + X = check_array(X, accept_sparse='csc') + # we only accept positive sparse matrix + if sparse.issparse(X) and np.any(X.data < 0): + raise ValueError('QuantileNormalizer only accepts non-negative' + ' sparse matrices') check_is_fitted(self, 'f_inverse_transform_') if sparse.issparse(X): return self._sparse_transform(X, False) From 1be3f5b46b8f8daa08f002d9f6c8e39641a82859 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 11:11:04 +0100 Subject: [PATCH 013/106] fix slice call --- sklearn/preprocessing/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 6de4f580ede13..3ddc2c48f9541 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2112,7 +2112,7 @@ def _sparse_transform(self, X, direction=True): func_transform = self.f_inverse_transform_ for feat_idx, f in enumerate(func_transform): - column_slice = slice(Xt.indptr[feat_idx]:Xt.indptr[feat_idx + 1]) + column_slice = slice(Xt.indptr[feat_idx], Xt.indptr[feat_idx + 1]) Xt.data[column_slice] = f(Xt.data[column_slice]) return Xt From 86b4a224897238c897cb13658db01a5ebfcfe25f Mon Sep 17 00:00:00 2001 From: Thierry Guillemot Date: Thu, 16 Feb 2017 11:11:41 +0100 Subject: [PATCH 014/106] Fix tests of QuantileNormalizer. --- sklearn/preprocessing/tests/test_data.py | 43 ++++++++++++++++-------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 9cc8a9f5e1882..37292ca0738a7 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -867,30 +867,40 @@ def test_quantile_normalizer_iris(): def test_quantile_normalizer_dense_toy(): - X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]).T normalizer = QuantileNormalizer() normalizer.fit(X) + X_trans = normalizer.fit_transform(X) - assert_array_almost_equal(np.min(X_trans, axis=0), 0.) - assert_array_almost_equal(np.max(X_trans, axis=0), 1.) + assert_almost_equal(np.min(X_trans, axis=0), 0.) + assert_almost_equal(np.max(X_trans, axis=0), 1.) + + X_test = np.array([ + [-1, 1, 0], + [101, 11, 10], + ]) + expected = np.array([ + [0, 0, 0], + [1, 1, 1], + ]) + assert_array_almost_equal(normalizer.transform(X_test), expected) X_trans_inv = normalizer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) def test_quantile_normalizer_sparse_toy(): - - X = np.array([[0, 25, 50, 75, 100], - [2, 4, 6, 8, 10], - [2.6, 4.1, 2.3, 9.5, 0.1]]).T + X = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], + [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T X = sparse.csc_matrix(X) normalizer = QuantileNormalizer() normalizer.fit(X) + X_trans = normalizer.fit_transform(X) assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.) assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.) 
@@ -900,14 +910,14 @@ def test_quantile_normalizer_sparse_toy(): def test_quantile_normalizer_error_neg_sparse(): - X = np.array([[0, 25, 50, 75, 100], - [-2, 4, 6, 8, 10], - [2.6, 4.1, 2.3, 9.5, 0.1]]).T + X = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], + [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T X = sparse.csc_matrix(X) normalizer = QuantileNormalizer() - assert_raises_regex(ValueError, "QuantileNormalizer only accepts semi-" - "positive sparse matrices", normalizer.fit, X) + assert_raises_regex(ValueError, "QuantileNormalizer only accepts " + "non-negative sparse matrices", normalizer.fit, X) def test_robust_scaler_invalid_range(): @@ -1703,9 +1713,14 @@ def test_fit_cold_start(): def test_quantile_normalizer_pickling(): - qn = QuantileNormalizer() - qn.fit(iris.data) + qn = QuantileNormalizer(n_quantiles=100) + qn_ser = pickle.dumps(qn, pickle.HIGHEST_PROTOCOL) + qn2 = pickle.loads(qn_ser) + assert_false(hasattr(qn2, 'f_transform_')) + assert_false(hasattr(qn2, 'f_inverse_transform_')) + + qn.fit(iris.data) qn_ser = pickle.dumps(qn, pickle.HIGHEST_PROTOCOL) qn2 = pickle.loads(qn_ser) assert_array_almost_equal(qn.transform(iris.data), From a742a6148ae53db5a447448249dbcaff8bac7de2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 11:51:03 +0100 Subject: [PATCH 015/106] Fix estimator compatibility * List of functions became tuple of functions * Check X consistency at transform and inverse transform time --- sklearn/preprocessing/data.py | 19 +++++++++++++++---- sklearn/preprocessing/tests/test_data.py | 1 - 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 3ddc2c48f9541..1ff5b8d0343b5 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1967,19 +1967,19 @@ def _build_f(self): """Build the transform functions.""" check_is_fitted(self, 'quantiles_') - self.f_transform_ = [ + self.f_transform_ = tuple([ interp1d(quantiles_feat, self.references_, bounds_error=False, fill_value=(min(self.references_), max(self.references_))) - for quantiles_feat in self.quantiles_.T] + for quantiles_feat in self.quantiles_.T]) - self.f_inverse_transform_ = [ + self.f_inverse_transform_ = tuple([ interp1d(self.references_, quantiles_feat, bounds_error=False, fill_value=(min(quantiles_feat), max(quantiles_feat))) - for quantiles_feat in self.quantiles_.T] + for quantiles_feat in self.quantiles_.T]) def _dense_fit(self, X): """Compute percentiles for dense matrices. @@ -2139,6 +2139,11 @@ def transform(self, X): raise ValueError('QuantileNormalizer only accepts non-negative' ' sparse matrices') check_is_fitted(self, 'f_transform_') + # check that the dimension of X are adequate with the fitted data + if X.shape[1] != len(self.f_transform_): + raise ValueError('X does not have the same number feature than the' + ' the previously fitted data. Got {} instead of' + ' {}'.format(X.shape[1], len(self.f_transform_))) if sparse.issparse(X): return self._sparse_transform(X, True) else: @@ -2164,6 +2169,12 @@ def inverse_transform(self, X): raise ValueError('QuantileNormalizer only accepts non-negative' ' sparse matrices') check_is_fitted(self, 'f_inverse_transform_') + # check that the dimension of X are adequate with the fitted data + if X.shape[1] != len(self.f_inverse_transform_): + raise ValueError('X does not have the same number feature than the' + ' the previously fitted data. 
Got {} instead of' + ' {}'.format(X.shape[1], + len(self.f_inverse_transform_))) if sparse.issparse(X): return self._sparse_transform(X, False) else: diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 37292ca0738a7..708181f8affb4 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -919,7 +919,6 @@ def test_quantile_normalizer_error_neg_sparse(): assert_raises_regex(ValueError, "QuantileNormalizer only accepts " "non-negative sparse matrices", normalizer.fit, X) - def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), From 79927b62187c28e92458702b50498724bb986954 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 12:05:12 +0100 Subject: [PATCH 016/106] fix doc --- sklearn/preprocessing/data.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 1ff5b8d0343b5..bc3961292446c 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1918,7 +1918,11 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): The normalization is applied on each feature independently. The cumulative density function of a feature is used to project the - original values. + original values. Features values of new/unseen data that fall below + or above the fitted range will be mapped to 0 and 1, respectively. + Note that this transform is non-linear. It may remove correlations between + variables measured at the same scale but renders variables measured at + different scales more directly comparable. Parameters ---------- @@ -1935,7 +1939,6 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): If None, the random number generator is the RandomState instance used by np.random. - Attributes ---------- references_ : ndarray, shape (n_quantiles,) @@ -1955,6 +1958,10 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): -------- :class:`sklearn.preprocessing.StandardScaler` to perform standardization that is faster, but less robust to outliers. + + :class:`sklearn.preprocessing.RobustScaler` to perform robust + standardization that removes the influence of outliers but does not put + outliers and inliers on the same scale. """ def __init__(self, n_quantiles=1000, subsample=int(1e5), From cc680a7840918bc8ac878f11c80183348c5791c7 Mon Sep 17 00:00:00 2001 From: Thierry Guillemot Date: Thu, 16 Feb 2017 12:08:41 +0100 Subject: [PATCH 017/106] Add negative ValueError tests for QuantileNormalizer. 
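[Editor's note, not part of the original commit message: the tests added below exercise the non-negative check used for sparse input. A minimal sketch of that check — only the explicitly stored entries of a CSC matrix need inspecting, since the implicit entries are exactly zero; the variable names are illustrative only.]

    import numpy as np
    from scipy import sparse

    X_neg = sparse.csc_matrix(np.array([[0., -2.],
                                        [25., 4.]]))
    if np.any(X_neg.data < 0):   # same test as in fit/transform/inverse_transform
        raise ValueError('QuantileNormalizer only accepts non-negative'
                         ' sparse matrices')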
--- sklearn/preprocessing/tests/test_data.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 708181f8affb4..8c4b04630fddc 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -909,15 +909,28 @@ def test_quantile_normalizer_sparse_toy(): assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) -def test_quantile_normalizer_error_neg_sparse(): +def test_quantile_normalizer_sparse_error(): X = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], - [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], + [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T X = sparse.csc_matrix(X) + X_neg = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], + [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T + X_neg = sparse.csc_matrix(X_neg) + normalizer = QuantileNormalizer() assert_raises_regex(ValueError, "QuantileNormalizer only accepts " - "non-negative sparse matrices", normalizer.fit, X) + "non-negative sparse matrices", normalizer.fit, X_neg) + + normalizer.fit(X) + assert_raises_regex(ValueError, "QuantileNormalizer only accepts " + "non-negative sparse matrices", + normalizer.transform, X_neg) + assert_raises_regex(ValueError, "QuantileNormalizer only accepts " + "non-negative sparse matrices", + normalizer.inverse_transform, X_neg) def test_robust_scaler_invalid_range(): for range_ in [ From 1260a70329f7ba11f73fbf66923093f5dfc1a13c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 12:12:16 +0100 Subject: [PATCH 018/106] Fix cosmetics --- sklearn/preprocessing/data.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index bc3961292446c..38c89df3bda93 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1975,18 +1975,18 @@ def _build_f(self): check_is_fitted(self, 'quantiles_') self.f_transform_ = tuple([ - interp1d(quantiles_feat, self.references_, + interp1d(quantiles_feature, self.references_, bounds_error=False, fill_value=(min(self.references_), max(self.references_))) - for quantiles_feat in self.quantiles_.T]) + for quantiles_feature in self.quantiles_.T]) self.f_inverse_transform_ = tuple([ - interp1d(self.references_, quantiles_feat, + interp1d(self.references_, quantiles_feature, bounds_error=False, - fill_value=(min(quantiles_feat), - max(quantiles_feat))) - for quantiles_feat in self.quantiles_.T]) + fill_value=(min(quantiles_feature), + max(quantiles_feature))) + for quantiles_feature in self.quantiles_.T]) def _dense_fit(self, X): """Compute percentiles for dense matrices. @@ -2021,7 +2021,7 @@ def _sparse_fit(self, X): """ rng = check_random_state(self.random_state) - n_samples, n_feat = X.get_shape() + n_samples, n_features = X.get_shape() if self.subsample < n_samples: subsample_idx = rng.choice(n_samples, self.subsample, replace=False) @@ -2032,8 +2032,9 @@ def _sparse_fit(self, X): endpoint=True) # FIXME: it does not take into account the zero in the computation self.quantiles_ = np.array([np.percentile( - X.data[X.indptr[feat]:X.indptr[feat + 1]], self.references_ * 100) - for feat in range(n_feat)]).T + X.data[X.indptr[feature_idx]:X.indptr[feature_idx + 1]], + self.references_ * 100) + for feature_idx in range(n_features)]).T def fit(self, X, y=None): """Compute the quantiles used for normalizing. 
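Editor's note — the hunk above renames the per-column variables used in _sparse_fit; here is a small self-contained sketch of the CSC access pattern it relies on. X.data[X.indptr[j]:X.indptr[j + 1]] holds only the stored entries of column j, which is what the FIXME refers to: implicit zeros do not enter the percentile computation at this stage.

    import numpy as np
    from scipy import sparse

    X = sparse.csc_matrix(np.array([[0., 2.],
                                    [25., 0.],
                                    [50., 6.]]))
    j = 0
    column_nnz_data = X.data[X.indptr[j]:X.indptr[j + 1]]
    print(column_nnz_data)                               # [25. 50.] -- the zero of column 0 is absent
    print(np.percentile(column_nnz_data, [0, 50, 100]))  # [25.  37.5 50. ]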
@@ -2089,8 +2090,8 @@ def _dense_transform(self, X, direction=True): else: func_transform = self.f_inverse_transform_ - for feat_idx, f in enumerate(func_transform): - Xt[:, feat_idx] = f(Xt[:, feat_idx]) + for feature_idx, f in enumerate(func_transform): + Xt[:, feature_idx] = f(Xt[:, feature_idx]) return Xt @@ -2118,8 +2119,9 @@ def _sparse_transform(self, X, direction=True): else: func_transform = self.f_inverse_transform_ - for feat_idx, f in enumerate(func_transform): - column_slice = slice(Xt.indptr[feat_idx], Xt.indptr[feat_idx + 1]) + for feature_idx, f in enumerate(func_transform): + column_slice = slice(Xt.indptr[feature_idx], + Xt.indptr[feature_idx + 1]) Xt.data[column_slice] = f(Xt.data[column_slice]) return Xt From c043c07cc1f559e26680c3e09e64327986d899c4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 13:53:04 +0100 Subject: [PATCH 019/106] Fix compatibility numpy <= 1.8 --- sklearn/preprocessing/data.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 38c89df3bda93..55d85b13e36c5 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1979,14 +1979,14 @@ def _build_f(self): bounds_error=False, fill_value=(min(self.references_), max(self.references_))) - for quantiles_feature in self.quantiles_.T]) + for quantiles_feature in self.quantiles_]) self.f_inverse_transform_ = tuple([ interp1d(self.references_, quantiles_feature, bounds_error=False, fill_value=(min(quantiles_feature), max(quantiles_feature))) - for quantiles_feature in self.quantiles_.T]) + for quantiles_feature in self.quantiles_]) def _dense_fit(self, X): """Compute percentiles for dense matrices. @@ -1999,16 +1999,22 @@ def _dense_fit(self, X): rng = check_random_state(self.random_state) # subsample the matrix X if necessary - if self.subsample < X.shape[0]: - subsample_idx = rng.choice(X.shape[0], self.subsample, + n_samples, n_features = X.shape + if self.subsample < n_samples: + subsample_idx = rng.choice(n_samples, self.subsample, replace=False) else: - subsample_idx = range(X.shape[0]) + subsample_idx = range(n_samples) + # for compatibility issue with numpy<=1.8.X, references_ + # need to be a list self.references_ = np.linspace(0, 1, self.n_quantiles, - endpoint=True) - self.quantiles_ = np.percentile(X[subsample_idx, :], - self.references_ * 100, axis=0) + endpoint=True).tolist() + # references_ is a list that we need to scale between + # 0 and 100. + self.quantiles_ = [np.percentile(X[subsample_idx, feature_idx], + [x * 100 for x in self.references_]) + for feature_idx in range(n_features)] def _sparse_fit(self, X): """Compute percentiles for sparse matrices. @@ -2028,13 +2034,17 @@ def _sparse_fit(self, X): X_csr = X.tocsr()[subsample_idx] X = X_csr.tocsc() + # for compatibility issue with numpy<=1.8.X, references_ + # need to be a list self.references_ = np.linspace(0, 1, self.n_quantiles, - endpoint=True) + endpoint=True).tolist() # FIXME: it does not take into account the zero in the computation - self.quantiles_ = np.array([np.percentile( + # references_ is a list that we need to scale between + # 0 and 100. `map` is used for that purpose. 
+ self.quantiles_ = [np.percentile( X.data[X.indptr[feature_idx]:X.indptr[feature_idx + 1]], - self.references_ * 100) - for feature_idx in range(n_features)]).T + [x * 100 for x in self.references_]) + for feature_idx in range(n_features)] def fit(self, X, y=None): """Compute the quantiles used for normalizing. From 0a7dc4d22250fdcda0da1264784a243b7cdf15ff Mon Sep 17 00:00:00 2001 From: Thierry Guillemot Date: Thu, 16 Feb 2017 14:10:05 +0100 Subject: [PATCH 020/106] Add n_features tests and correct ValueError. --- sklearn/preprocessing/data.py | 4 +- sklearn/preprocessing/tests/test_data.py | 57 +++++++++++++----------- 2 files changed, 33 insertions(+), 28 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 55d85b13e36c5..7355fe7e87c79 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2160,7 +2160,7 @@ def transform(self, X): check_is_fitted(self, 'f_transform_') # check that the dimension of X are adequate with the fitted data if X.shape[1] != len(self.f_transform_): - raise ValueError('X does not have the same number feature than the' + raise ValueError('X does not have the same number of feature than' ' the previously fitted data. Got {} instead of' ' {}'.format(X.shape[1], len(self.f_transform_))) if sparse.issparse(X): @@ -2190,7 +2190,7 @@ def inverse_transform(self, X): check_is_fitted(self, 'f_inverse_transform_') # check that the dimension of X are adequate with the fitted data if X.shape[1] != len(self.f_inverse_transform_): - raise ValueError('X does not have the same number feature than the' + raise ValueError('X does not have the same number of feature than' ' the previously fitted data. Got {} instead of' ' {}'.format(X.shape[1], len(self.f_inverse_transform_))) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 8c4b04630fddc..23b7d43fdaa92 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -855,17 +855,46 @@ def test_robust_scaler_iris_quantiles(): def test_quantile_normalizer_iris(): X = iris.data normalizer = QuantileNormalizer() - X_trans = normalizer.fit_transform(X) # FIXME: one of those will drive to precision error # in the interpolation # assert_array_almost_equal(np.min(X_trans, axis=0), 0.) # assert_array_almost_equal(np.max(X_trans, axis=0), 1.) 
- X_trans_inv = normalizer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) +def test_quantile_normalizer_check_error(): + X = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], + [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T + X = sparse.csc_matrix(X) + X_neg = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], + [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T + X_neg = sparse.csc_matrix(X_neg) + + normalizer = QuantileNormalizer() + assert_raises_regex(ValueError, "QuantileNormalizer only accepts " + "non-negative sparse matrices", normalizer.fit, X_neg) + normalizer.fit(X) + assert_raises_regex(ValueError, "QuantileNormalizer only accepts " + "non-negative sparse matrices", + normalizer.transform, X_neg) + assert_raises_regex(ValueError, "QuantileNormalizer only accepts " + "non-negative sparse matrices", + normalizer.inverse_transform, X_neg) + + X_bad_feat = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T + assert_raises_regex(ValueError, "X does not have the same number of " + "feature than the previously fitted data.", + normalizer.transform, X_bad_feat) + assert_raises_regex(ValueError, "X does not have the same number of " + "feature than the previously fitted data.", + normalizer.inverse_transform, X_bad_feat) + + def test_quantile_normalizer_dense_toy(): X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], @@ -908,30 +937,6 @@ def test_quantile_normalizer_sparse_toy(): X_trans_inv = normalizer.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) - -def test_quantile_normalizer_sparse_error(): - X = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], - [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], - [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T - X = sparse.csc_matrix(X) - - X_neg = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], - [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], - [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T - X_neg = sparse.csc_matrix(X_neg) - - normalizer = QuantileNormalizer() - assert_raises_regex(ValueError, "QuantileNormalizer only accepts " - "non-negative sparse matrices", normalizer.fit, X_neg) - - normalizer.fit(X) - assert_raises_regex(ValueError, "QuantileNormalizer only accepts " - "non-negative sparse matrices", - normalizer.transform, X_neg) - assert_raises_regex(ValueError, "QuantileNormalizer only accepts " - "non-negative sparse matrices", - normalizer.inverse_transform, X_neg) - def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), From 36a887050020cb4338bd8a19b9e6a727b8835e9b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 14:19:28 +0100 Subject: [PATCH 021/106] PEP8 --- sklearn/preprocessing/__init__.py | 1 + sklearn/preprocessing/tests/test_data.py | 32 ++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 6ef4dc0c2ba62..514af29fa5f34 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -57,4 +57,5 @@ 'maxabs_scale', 'minmax_scale', 'label_binarize', + 'quantile_normalize', ] diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 23b7d43fdaa92..add0933b32dfd 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -43,6 +43,7 @@ from sklearn.preprocessing.data import MinMaxScaler from sklearn.preprocessing.data import minmax_scale from 
sklearn.preprocessing.data import QuantileNormalizer +from sklearn.preprocessing.data import quantile_normalize from sklearn.preprocessing.data import MaxAbsScaler from sklearn.preprocessing.data import maxabs_scale from sklearn.preprocessing.data import RobustScaler @@ -870,8 +871,8 @@ def test_quantile_normalizer_check_error(): [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T X = sparse.csc_matrix(X) X_neg = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], - [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], - [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T + [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T X_neg = sparse.csc_matrix(X_neg) normalizer = QuantileNormalizer() @@ -920,6 +921,14 @@ def test_quantile_normalizer_dense_toy(): X_trans_inv = normalizer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) + # test subsampling + # FIXME: there is not comparison for the moment + random_state = 42 + normalizer.set_params(**{'subsample': 3, + 'n_quantiles': 2, + 'random_state': random_state}) + X_trans = normalizer.fit_transform(X) + def test_quantile_normalizer_sparse_toy(): X = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], @@ -937,6 +946,25 @@ def test_quantile_normalizer_sparse_toy(): X_trans_inv = normalizer.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) + # test subsampling + # FIXME: there is not comparison for the moment + random_state = 42 + normalizer.set_params(**{'subsample': 3, + 'n_quantiles': 2, + 'random_state': random_state}) + X_trans = normalizer.fit_transform(X) + + +def test_quantile_normalize_axis1(): + X = np.array([[0, 25, 50, 75, 100], + [2, 4, 6, 8, 10], + [2.6, 4.1, 2.3, 9.5, 0.1]]) + + X_trans_a0 = quantile_normalize(X.T, axis=0) + X_trans_a1 = quantile_normalize(X, axis=1) + assert_array_almost_equal(X_trans_a0, X_trans_a1.T) + + def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), From 94e26ad4c3ed2ad1f9aeaaca7ceb972908ea4983 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 15:47:06 +0100 Subject: [PATCH 022/106] fix fill_value for early scipy compatibility --- doc/whats_new.rst | 7 ++++++- sklearn/preprocessing/data.py | 31 +++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index ce72f193ed8dd..7e5e0bb300fc1 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -56,6 +56,11 @@ New features multinomial logistic loss, and behaves marginally better than 'sag' during the first epochs of ridge and logistic regression. By `Arthur Mensch`_. + - Added :class:`preprocessing.QuantileNormalizer` class for features + normalization based on quantiles. + :issue:`8363` by :user:`Denis Engemann `, + :user:`Guillaume Lemaitre `, `Olivier Grisel`_, + `Raghav RV`_, and :user:`Thierry Guillemot `. Enhancements ............ @@ -5059,4 +5064,4 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. .. _Anish Shah: https://github.com/AnishShah .. _Neeraj Gangwar: http://neerajgangwar.in -.. _Arthur Mensch: https://amensch.fr \ No newline at end of file +.. 
_Arthur Mensch: https://amensch.fr diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 7355fe7e87c79..1b75674f5d22c 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1977,15 +1977,13 @@ def _build_f(self): self.f_transform_ = tuple([ interp1d(quantiles_feature, self.references_, bounds_error=False, - fill_value=(min(self.references_), - max(self.references_))) + fill_value=0.) for quantiles_feature in self.quantiles_]) self.f_inverse_transform_ = tuple([ interp1d(self.references_, quantiles_feature, bounds_error=False, - fill_value=(min(quantiles_feature), - max(quantiles_feature))) + fill_value=0.) for quantiles_feature in self.quantiles_]) def _dense_fit(self, X): @@ -2101,7 +2099,19 @@ def _dense_transform(self, X, direction=True): func_transform = self.f_inverse_transform_ for feature_idx, f in enumerate(func_transform): + # older version of scipy do not handle tuple as fill_value + # clipping the value before transform solve the issue + if not direction: + np.clip(Xt[:, feature_idx], min(self.references_), + max(self.references_), out=Xt[:, feature_idx]) + else: + np.clip(Xt[:, feature_idx], min(self.quantiles_[feature_idx]), + max(self.quantiles_[feature_idx]), + out=Xt[:, feature_idx]) Xt[:, feature_idx] = f(Xt[:, feature_idx]) + # FIXME: earlier version of scipy through nan when x_min is passed + # New one just has float precision problem + Xt[:, feature_idx][np.isnan(Xt[:, feature_idx])] = 0.0 return Xt @@ -2132,7 +2142,20 @@ def _sparse_transform(self, X, direction=True): for feature_idx, f in enumerate(func_transform): column_slice = slice(Xt.indptr[feature_idx], Xt.indptr[feature_idx + 1]) + # older version of scipy do not handle tuple as fill_value + # clipping the value before transform solve the issue + if not direction: + np.clip(Xt.data[column_slice], min(self.references_), + max(self.references_), out=Xt.data[column_slice]) + else: + np.clip(Xt.data[column_slice], + min(self.quantiles_[feature_idx]), + max(self.quantiles_[feature_idx]), + out=Xt.data[column_slice]) Xt.data[column_slice] = f(Xt.data[column_slice]) + # FIXME: earlier version of scipy through nan when x_min is passed + # New one just has float precision problem + Xt.data[column_slice][np.isnan(Xt.data[column_slice])] = 0.0 return Xt From f552529df813df0ac4f9627a4d408c2ec1f3c250 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 16:20:18 +0100 Subject: [PATCH 023/106] simplify sampling --- sklearn/preprocessing/data.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 1b75674f5d22c..cb2ff7e826aff 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2026,11 +2026,6 @@ def _sparse_fit(self, X): rng = check_random_state(self.random_state) n_samples, n_features = X.get_shape() - if self.subsample < n_samples: - subsample_idx = rng.choice(n_samples, self.subsample, - replace=False) - X_csr = X.tocsr()[subsample_idx] - X = X_csr.tocsc() # for compatibility issue with numpy<=1.8.X, references_ # need to be a list @@ -2038,11 +2033,23 @@ def _sparse_fit(self, X): endpoint=True).tolist() # FIXME: it does not take into account the zero in the computation # references_ is a list that we need to scale between - # 0 and 100. `map` is used for that purpose. 
- self.quantiles_ = [np.percentile( - X.data[X.indptr[feature_idx]:X.indptr[feature_idx + 1]], - [x * 100 for x in self.references_]) - for feature_idx in range(n_features)] + # 0 and 100. + self.quantiles_ = [] + for feature_idx in range(n_features): + column_nnz_data = X.data[X.indptr[feature_idx]: + X.indptr[feature_idx + 1]] + if len(column_nnz_data) > self.subsample: + column_subsample = (self.subsample * len(column_nnz_data) // + n_samples) + column_data = np.zeros(shape=self.subsample, dtype=X.dtype) + column_data[:column_subsample] = rng.choice(column_nnz_data, + column_subsample, + replace=False) + else: + column_data = column_nnz_data + self.quantiles_.append( + np.percentile(column_data, + [x * 100 for x in self.references_])) def fit(self, X, y=None): """Compute the quantiles used for normalizing. From 8a4592c6c210706230daffac23ff6dd8c66d836a Mon Sep 17 00:00:00 2001 From: Thierry Guillemot Date: Thu, 16 Feb 2017 16:42:47 +0100 Subject: [PATCH 024/106] Fix tests. --- sklearn/preprocessing/data.py | 47 ++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index cb2ff7e826aff..afd9f5d400621 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2096,10 +2096,9 @@ def _dense_transform(self, X, direction=True): Returns ------- - Xt : ndarray, shape (n_samples, n_features) + X : ndarray, shape (n_samples, n_features) Projected data. """ - Xt = X.copy() if direction: func_transform = self.f_transform_ else: @@ -2108,19 +2107,21 @@ def _dense_transform(self, X, direction=True): for feature_idx, f in enumerate(func_transform): # older version of scipy do not handle tuple as fill_value # clipping the value before transform solve the issue - if not direction: - np.clip(Xt[:, feature_idx], min(self.references_), - max(self.references_), out=Xt[:, feature_idx]) - else: - np.clip(Xt[:, feature_idx], min(self.quantiles_[feature_idx]), + if direction: + np.clip(X[:, feature_idx], min(self.quantiles_[feature_idx]), max(self.quantiles_[feature_idx]), - out=Xt[:, feature_idx]) - Xt[:, feature_idx] = f(Xt[:, feature_idx]) + out=X[:, feature_idx]) + else: + np.clip(X[:, feature_idx], min(self.references_), + max(self.references_), out=X[:, feature_idx]) + print(X[:, feature_idx]) + X[:, feature_idx] = f(X[:, feature_idx]) + print(X[:, feature_idx]) # FIXME: earlier version of scipy through nan when x_min is passed # New one just has float precision problem - Xt[:, feature_idx][np.isnan(Xt[:, feature_idx])] = 0.0 + # X[:, feature_idx][np.isnan(X[:, feature_idx])] = 0.0 - return Xt + return X def _sparse_transform(self, X, direction=True): """Forward and inverse transform for sparse matrices. @@ -2137,34 +2138,33 @@ def _sparse_transform(self, X, direction=True): Returns ------- - Xt : sparse matrix CSC, shape (n_samples, n_features) + X : sparse matrix CSC, shape (n_samples, n_features) Projected data. 
""" - Xt = X.copy() if direction: func_transform = self.f_transform_ else: func_transform = self.f_inverse_transform_ for feature_idx, f in enumerate(func_transform): - column_slice = slice(Xt.indptr[feature_idx], - Xt.indptr[feature_idx + 1]) + column_slice = slice(X.indptr[feature_idx], + X.indptr[feature_idx + 1]) # older version of scipy do not handle tuple as fill_value # clipping the value before transform solve the issue if not direction: - np.clip(Xt.data[column_slice], min(self.references_), - max(self.references_), out=Xt.data[column_slice]) + np.clip(X.data[column_slice], min(self.references_), + max(self.references_), out=X.data[column_slice]) else: - np.clip(Xt.data[column_slice], + np.clip(X.data[column_slice], min(self.quantiles_[feature_idx]), max(self.quantiles_[feature_idx]), - out=Xt.data[column_slice]) - Xt.data[column_slice] = f(Xt.data[column_slice]) + out=X.data[column_slice]) + X.data[column_slice] = f(X.data[column_slice]) # FIXME: earlier version of scipy through nan when x_min is passed # New one just has float precision problem - Xt.data[column_slice][np.isnan(Xt.data[column_slice])] = 0.0 + X.data[column_slice][np.isnan(X.data[column_slice])] = 0.0 - return Xt + return X def transform(self, X): """Feature-wise normalization of the data. @@ -2182,7 +2182,8 @@ def transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. """ - X = check_array(X, accept_sparse='csc') + X = check_array(X, accept_sparse='csc', copy=True, + dtype=[np.float64, np.float32]) # we only accept positive sparse matrix if sparse.issparse(X) and np.any(X.data < 0): raise ValueError('QuantileNormalizer only accepts non-negative' From 9070871974ea47207f87e05a0f8dfac1d9059694 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 17:38:11 +0100 Subject: [PATCH 025/106] removing last pring --- sklearn/preprocessing/data.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index afd9f5d400621..a521570a05773 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2114,12 +2114,10 @@ def _dense_transform(self, X, direction=True): else: np.clip(X[:, feature_idx], min(self.references_), max(self.references_), out=X[:, feature_idx]) - print(X[:, feature_idx]) X[:, feature_idx] = f(X[:, feature_idx]) - print(X[:, feature_idx]) # FIXME: earlier version of scipy through nan when x_min is passed # New one just has float precision problem - # X[:, feature_idx][np.isnan(X[:, feature_idx])] = 0.0 + X[:, feature_idx][np.isnan(X[:, feature_idx])] = 0.0 return X From cbe4da96161cec11dfb50d5d1a6844c8612fb4c2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 18:57:09 +0100 Subject: [PATCH 026/106] Change choice for permutation --- sklearn/preprocessing/data.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index a521570a05773..fb80afa76200f 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2041,12 +2041,13 @@ def _sparse_fit(self, X): if len(column_nnz_data) > self.subsample: column_subsample = (self.subsample * len(column_nnz_data) // n_samples) + column_idx = rng.permutation(range(len(column_nnz_data))) column_data = np.zeros(shape=self.subsample, dtype=X.dtype) - column_data[:column_subsample] = rng.choice(column_nnz_data, - column_subsample, - replace=False) + column_data[:column_subsample] = 
column_nnz_data[ + column_idx[:column_subsample]] else: - column_data = column_nnz_data + column_data = np.zeros(shape=n_samples, dtype=X.dtype) + column_data[:len(column_nnz_data)] = column_nnz_data self.quantiles_.append( np.percentile(column_data, [x * 100 for x in self.references_])) From 1051fbb4e89e88148e95f529c37b1386d229b612 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 19:01:21 +0100 Subject: [PATCH 027/106] cosmetics --- sklearn/preprocessing/data.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index fb80afa76200f..34f4821added9 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2005,7 +2005,7 @@ def _dense_fit(self, X): subsample_idx = range(n_samples) # for compatibility issue with numpy<=1.8.X, references_ - # need to be a list + # need to be a list scaled between 0 and 100 self.references_ = np.linspace(0, 1, self.n_quantiles, endpoint=True).tolist() # references_ is a list that we need to scale between @@ -2041,6 +2041,8 @@ def _sparse_fit(self, X): if len(column_nnz_data) > self.subsample: column_subsample = (self.subsample * len(column_nnz_data) // n_samples) + # choice is not available in numpy <= 1.7 + # used permutation instead. column_idx = rng.permutation(range(len(column_nnz_data))) column_data = np.zeros(shape=self.subsample, dtype=X.dtype) column_data[:column_subsample] = column_nnz_data[ From 790b0cbdc34d50a101c34ff8bd73646d38ce6c2f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 19:40:12 +0100 Subject: [PATCH 028/106] fix remove remaining choice --- sklearn/preprocessing/data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 34f4821added9..921b2015284e4 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1999,8 +1999,7 @@ def _dense_fit(self, X): # subsample the matrix X if necessary n_samples, n_features = X.shape if self.subsample < n_samples: - subsample_idx = rng.choice(n_samples, self.subsample, - replace=False) + subsample_idx = rng.permutation(range(n_samples))[:self.subsample] else: subsample_idx = range(n_samples) From 971308906ebaedf0e8483a2bb57d72b4fc5886c0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 19:42:48 +0100 Subject: [PATCH 029/106] DOC --- sklearn/preprocessing/data.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 921b2015284e4..5ff38fe2d7ea8 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1914,15 +1914,16 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): This Normalizer scales the features between 0 and 1, equalizing the distribution of each feature to a uniform distribution. Therefore, for a given feature, this normalization tends to spread out the most - frequent values. + frequent values. It also reduces the impact of (marginal) outliers: + this is therefore a robust preprocessing scheme. The normalization is applied on each feature independently. The cumulative density function of a feature is used to project the original values. Features values of new/unseen data that fall below or above the fitted range will be mapped to 0 and 1, respectively. - Note that this transform is non-linear. 
It may remove correlations between - variables measured at the same scale but renders variables measured at - different scales more directly comparable. + Note that this transform is non-linear. It may distort linear correlations + between variables measured at the same scale but renders variables measured + at different scales more directly comparable. Parameters ---------- From a1052de7c9aadc48b741da4dfec805a4bb4847c4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 22:44:24 +0100 Subject: [PATCH 030/106] Fix inconsistencies --- sklearn/preprocessing/data.py | 97 ++++++++++++------------ sklearn/preprocessing/tests/test_data.py | 10 +++ 2 files changed, 57 insertions(+), 50 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 5ff38fe2d7ea8..0de8ec8d2dffa 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1942,19 +1942,9 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): Attributes ---------- - references_ : ndarray, shape (n_quantiles,) - The quantiles of reference. - quantiles_ : ndarray, shape (n_quantiles, n_features) The values corresponding the quantiles of reference. - f_transform_ : list of callable, shape (n_quantiles,) - The cumulative density function used to project the data. - - f_inverse_transform_ : list of callable, shape (n_quantiles,) - The inverse of the cumulative density function used to project the - data. - See also -------- :class:`sklearn.preprocessing.StandardScaler` to perform standardization @@ -1975,17 +1965,19 @@ def _build_f(self): """Build the transform functions.""" check_is_fitted(self, 'quantiles_') - self.f_transform_ = tuple([ - interp1d(quantiles_feature, self.references_, + references = np.linspace(0, 1, self.n_quantiles, endpoint=True) + + self._f_transform = tuple([ + interp1d(quantiles_feature, references, bounds_error=False, fill_value=0.) - for quantiles_feature in self.quantiles_]) + for quantiles_feature in self.quantiles_.T]) - self.f_inverse_transform_ = tuple([ - interp1d(self.references_, quantiles_feature, + self._f_inverse_transform = tuple([ + interp1d(references, quantiles_feature, bounds_error=False, fill_value=0.) - for quantiles_feature in self.quantiles_]) + for quantiles_feature in self.quantiles_.T]) def _dense_fit(self, X): """Compute percentiles for dense matrices. @@ -2004,15 +1996,15 @@ def _dense_fit(self, X): else: subsample_idx = range(n_samples) - # for compatibility issue with numpy<=1.8.X, references_ + # for compatibility issue with numpy<=1.8.X, references # need to be a list scaled between 0 and 100 - self.references_ = np.linspace(0, 1, self.n_quantiles, - endpoint=True).tolist() - # references_ is a list that we need to scale between + references = np.linspace(0, 1, self.n_quantiles, + endpoint=True).tolist() + # references is a list that we need to scale between # 0 and 100. - self.quantiles_ = [np.percentile(X[subsample_idx, feature_idx], - [x * 100 for x in self.references_]) - for feature_idx in range(n_features)] + self.quantiles_ = np.array([np.percentile( + X[subsample_idx,feature_idx], [x * 100 for x in references]) + for feature_idx in range(n_features)]).T def _sparse_fit(self, X): """Compute percentiles for sparse matrices. 
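Editor's note — with the shape convention introduced in the hunk above (quantiles_ of shape (n_quantiles, n_features)), each column feeds one forward/inverse interp1d pair in _build_f. A reduced, self-contained sketch of such a pair and of the round trip the tests rely on (single column, small n_quantiles; not the estimator itself):

    import numpy as np
    from scipy.interpolate import interp1d

    references = np.linspace(0, 1, 5)
    quantiles_col = np.percentile([2., 4., 6., 8., 10.], references * 100)

    # forward: data -> [0, 1]; inverse: [0, 1] -> data
    f = interp1d(quantiles_col, references, bounds_error=False, fill_value=0.)
    f_inv = interp1d(references, quantiles_col, bounds_error=False, fill_value=0.)

    x = np.array([3., 7., 9.])
    np.testing.assert_allclose(f_inv(f(x)), x)   # forward then inverse recovers x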
@@ -2027,12 +2019,11 @@ def _sparse_fit(self, X): n_samples, n_features = X.get_shape() - # for compatibility issue with numpy<=1.8.X, references_ + # for compatibility issue with numpy<=1.8.X, references # need to be a list - self.references_ = np.linspace(0, 1, self.n_quantiles, - endpoint=True).tolist() - # FIXME: it does not take into account the zero in the computation - # references_ is a list that we need to scale between + references = np.linspace(0, 1, self.n_quantiles, + endpoint=True).tolist() + # references is a list that we need to scale between # 0 and 100. self.quantiles_ = [] for feature_idx in range(n_features): @@ -2052,7 +2043,8 @@ def _sparse_fit(self, X): column_data[:len(column_nnz_data)] = column_nnz_data self.quantiles_.append( np.percentile(column_data, - [x * 100 for x in self.references_])) + [x * 100 for x in references])) + self.quantiles_ = np.array(self.quantiles_).T def fit(self, X, y=None): """Compute the quantiles used for normalizing. @@ -2103,20 +2095,23 @@ def _dense_transform(self, X, direction=True): Projected data. """ if direction: - func_transform = self.f_transform_ + func_transform = self._f_transform else: - func_transform = self.f_inverse_transform_ + func_transform = self._f_inverse_transform + + references = np.linspace(0, 1, self.n_quantiles, endpoint=True) for feature_idx, f in enumerate(func_transform): # older version of scipy do not handle tuple as fill_value # clipping the value before transform solve the issue if direction: - np.clip(X[:, feature_idx], min(self.quantiles_[feature_idx]), - max(self.quantiles_[feature_idx]), + np.clip(X[:, feature_idx], + min(self.quantiles_[:, feature_idx]), + max(self.quantiles_[:, feature_idx]), out=X[:, feature_idx]) else: - np.clip(X[:, feature_idx], min(self.references_), - max(self.references_), out=X[:, feature_idx]) + np.clip(X[:, feature_idx], min(references), + max(references), out=X[:, feature_idx]) X[:, feature_idx] = f(X[:, feature_idx]) # FIXME: earlier version of scipy through nan when x_min is passed # New one just has float precision problem @@ -2143,9 +2138,11 @@ def _sparse_transform(self, X, direction=True): Projected data. 
""" if direction: - func_transform = self.f_transform_ + func_transform = self._f_transform else: - func_transform = self.f_inverse_transform_ + func_transform = self._f_inverse_transform + + references = np.linspace(0, 1, self.n_quantiles, endpoint=True) for feature_idx, f in enumerate(func_transform): column_slice = slice(X.indptr[feature_idx], @@ -2153,12 +2150,12 @@ def _sparse_transform(self, X, direction=True): # older version of scipy do not handle tuple as fill_value # clipping the value before transform solve the issue if not direction: - np.clip(X.data[column_slice], min(self.references_), - max(self.references_), out=X.data[column_slice]) + np.clip(X.data[column_slice], min(references), + max(references), out=X.data[column_slice]) else: np.clip(X.data[column_slice], - min(self.quantiles_[feature_idx]), - max(self.quantiles_[feature_idx]), + min(self.quantiles_[:, feature_idx]), + max(self.quantiles_[:, feature_idx]), out=X.data[column_slice]) X.data[column_slice] = f(X.data[column_slice]) # FIXME: earlier version of scipy through nan when x_min is passed @@ -2189,12 +2186,12 @@ def transform(self, X): if sparse.issparse(X) and np.any(X.data < 0): raise ValueError('QuantileNormalizer only accepts non-negative' ' sparse matrices') - check_is_fitted(self, 'f_transform_') + check_is_fitted(self, '_f_transform') # check that the dimension of X are adequate with the fitted data - if X.shape[1] != len(self.f_transform_): + if X.shape[1] != len(self._f_transform): raise ValueError('X does not have the same number of feature than' ' the previously fitted data. Got {} instead of' - ' {}'.format(X.shape[1], len(self.f_transform_))) + ' {}'.format(X.shape[1], len(self._f_transform))) if sparse.issparse(X): return self._sparse_transform(X, True) else: @@ -2219,13 +2216,13 @@ def inverse_transform(self, X): if sparse.issparse(X) and np.any(X.data < 0): raise ValueError('QuantileNormalizer only accepts non-negative' ' sparse matrices') - check_is_fitted(self, 'f_inverse_transform_') + check_is_fitted(self, '_f_inverse_transform') # check that the dimension of X are adequate with the fitted data - if X.shape[1] != len(self.f_inverse_transform_): + if X.shape[1] != len(self._f_inverse_transform): raise ValueError('X does not have the same number of feature than' ' the previously fitted data. Got {} instead of' ' {}'.format(X.shape[1], - len(self.f_inverse_transform_))) + len(self._f_inverse_transform))) if sparse.issparse(X): return self._sparse_transform(X, False) else: @@ -2235,8 +2232,8 @@ def __getstate__(self): """Pickle-protocol - return state of the estimator. """ state = super(QuantileNormalizer, self).__getstate__() # remove interpolation method - state.pop('f_transform_', None) - state.pop('f_inverse_transform_', None) + state.pop('_f_transform', None) + state.pop('_f_inverse_transform', None) return state def __setstate__(self, state): @@ -2244,7 +2241,7 @@ def __setstate__(self, state): We need to rebuild the interpolation function. 
""" super(QuantileNormalizer, self).__setstate__(state) - if hasattr(self, 'references_') and hasattr(self, 'quantiles_'): + if hasattr(self, 'quantiles_'): self._build_f() diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index add0933b32dfd..bae3e6e22ef62 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -934,6 +934,7 @@ def test_quantile_normalizer_sparse_toy(): X = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T + X = sparse.csc_matrix(X) normalizer = QuantileNormalizer() @@ -946,6 +947,15 @@ def test_quantile_normalizer_sparse_toy(): X_trans_inv = normalizer.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) + normalizer_dense = QuantileNormalizer().fit(X.toarray()) + + X_trans = normalizer_dense.transform(X) + assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.) + assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.) + + X_trans_inv = normalizer_dense.inverse_transform(X_trans) + assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) + # test subsampling # FIXME: there is not comparison for the moment random_state = 42 From 5b48b2244f31a522c14874458b78282228926698 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 16 Feb 2017 23:06:25 +0100 Subject: [PATCH 031/106] pep8 --- sklearn/preprocessing/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 0de8ec8d2dffa..383444aa96021 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2003,7 +2003,7 @@ def _dense_fit(self, X): # references is a list that we need to scale between # 0 and 100. self.quantiles_ = np.array([np.percentile( - X[subsample_idx,feature_idx], [x * 100 for x in references]) + X[subsample_idx, feature_idx], [x * 100 for x in references]) for feature_idx in range(n_features)]).T def _sparse_fit(self, X): From 45172faa99023e3224b2479ea36fb18737d29ab4 Mon Sep 17 00:00:00 2001 From: Thierry Guillemot Date: Fri, 17 Feb 2017 10:10:39 +0100 Subject: [PATCH 032/106] Add checker for init parameters. --- sklearn/preprocessing/data.py | 11 +++++++++++ sklearn/preprocessing/tests/test_data.py | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 383444aa96021..857ce2f90b21c 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2063,6 +2063,17 @@ def fit(self, X, y=None): Returns self """ X = check_array(X, accept_sparse='csc') + + if self.n_quantiles <= 0: + raise ValueError("Invalid value for 'n_quantiles': %d. " + "The number of quantiles must be at least one." + % self.n_quantiles) + + if self.subsample <= 0: + raise ValueError("Invalid value for 'subsample': %d. " + "The number of quantiles must be at least one." 
+ % self.subsample) + # we only accept positive sparse matrix if sparse.issparse(X) and np.any(X.data < 0): raise ValueError('QuantileNormalizer only accepts non-negative' diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index bae3e6e22ef62..3c5f31fc500fe 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -875,6 +875,11 @@ def test_quantile_normalizer_check_error(): [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T X_neg = sparse.csc_matrix(X_neg) + assert_raises_regex(ValueError, "Invalid value for 'n_quantiles'", + QuantileNormalizer(n_quantiles=0).fit, X_neg) + assert_raises_regex(ValueError, "Invalid value for 'subsample'", + QuantileNormalizer(subsample=0).fit, X_neg) + normalizer = QuantileNormalizer() assert_raises_regex(ValueError, "QuantileNormalizer only accepts " "non-negative sparse matrices", normalizer.fit, X_neg) From ef3b403ebccef8cdf5ac14757c2a8022b367eef7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 17 Feb 2017 11:21:22 +0100 Subject: [PATCH 033/106] hack bounds and make a test --- sklearn/preprocessing/data.py | 59 ++++++++++++++++-------- sklearn/preprocessing/tests/test_data.py | 18 +++++++- 2 files changed, 55 insertions(+), 22 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 857ce2f90b21c..2bd78f46a36c4 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -31,6 +31,8 @@ from ..utils.validation import (check_is_fitted, check_random_state, FLOAT_DTYPES) +BOUNDS_THRESHOLD = 1e-7 + zip = six.moves.zip map = six.moves.map @@ -2117,17 +2119,27 @@ def _dense_transform(self, X, direction=True): # clipping the value before transform solve the issue if direction: np.clip(X[:, feature_idx], - min(self.quantiles_[:, feature_idx]), - max(self.quantiles_[:, feature_idx]), + self.quantiles_[0, feature_idx], + self.quantiles_[-1, feature_idx], out=X[:, feature_idx]) else: - np.clip(X[:, feature_idx], min(references), - max(references), out=X[:, feature_idx]) - X[:, feature_idx] = f(X[:, feature_idx]) - # FIXME: earlier version of scipy through nan when x_min is passed - # New one just has float precision problem - X[:, feature_idx][np.isnan(X[:, feature_idx])] = 0.0 - + np.clip(X[:, feature_idx], references[0], + references[-1], out=X[:, feature_idx]) + # Avoid computing for bounds due to numerical error of interp1d + lower_bounds_idx = (X[:, feature_idx] - BOUNDS_THRESHOLD < + min(X[:, feature_idx])) + upper_bounds_idx = (X[:, feature_idx] + BOUNDS_THRESHOLD > + max(X[:, feature_idx])) + bounds_idx = np.bitwise_or(lower_bounds_idx, upper_bounds_idx) + X[~bounds_idx, feature_idx] = f(X[~bounds_idx, feature_idx]) + if direction: + X[upper_bounds_idx, feature_idx] = references[-1] + X[lower_bounds_idx, feature_idx] = references[0] + else: + X[upper_bounds_idx, feature_idx] = self.quantiles_[ + -1, feature_idx] + X[lower_bounds_idx, feature_idx] = self.quantiles_[ + 0, feature_idx] return X def _sparse_transform(self, X, direction=True): @@ -2160,19 +2172,26 @@ def _sparse_transform(self, X, direction=True): X.indptr[feature_idx + 1]) # older version of scipy do not handle tuple as fill_value # clipping the value before transform solve the issue - if not direction: - np.clip(X.data[column_slice], min(references), - max(references), out=X.data[column_slice]) - else: + if direction: np.clip(X.data[column_slice], - min(self.quantiles_[:, feature_idx]), - max(self.quantiles_[:, feature_idx]), + 
self.quantiles_[0, feature_idx], + self.quantiles_[-1, feature_idx], out=X.data[column_slice]) - X.data[column_slice] = f(X.data[column_slice]) - # FIXME: earlier version of scipy through nan when x_min is passed - # New one just has float precision problem - X.data[column_slice][np.isnan(X.data[column_slice])] = 0.0 - + else: + np.clip(X.data[column_slice], references[0], + references[-1], out=X.data[column_slice]) + # Avoid computing for bounds due to numerical error of interp1d + # Check that there is value + if X.data[column_slice].size: + upper_bounds_idx = (X.data[column_slice] + BOUNDS_THRESHOLD > + max(X.data[column_slice])) + X.data[column_slice][~upper_bounds_idx] = f( + X.data[column_slice][~upper_bounds_idx]) + if direction: + X.data[column_slice][upper_bounds_idx] = references[-1] + else: + X.data[column_slice][upper_bounds_idx] = self.quantiles_[ + -1, feature_idx] return X def transform(self, X): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 3c5f31fc500fe..75c54c4e5f241 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -859,8 +859,8 @@ def test_quantile_normalizer_iris(): X_trans = normalizer.fit_transform(X) # FIXME: one of those will drive to precision error # in the interpolation - # assert_array_almost_equal(np.min(X_trans, axis=0), 0.) - # assert_array_almost_equal(np.max(X_trans, axis=0), 1.) + assert_array_almost_equal(np.min(X_trans, axis=0), 0.) + assert_array_almost_equal(np.max(X_trans, axis=0), 1.) X_trans_inv = normalizer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) @@ -980,6 +980,20 @@ def test_quantile_normalize_axis1(): assert_array_almost_equal(X_trans_a0, X_trans_a1.T) +def test_qunatile_normalzer_bounds(): + X_dense = np.array([[0, 0], + [0, 0], + [1, 0]]) + X_sparse = sparse.csc_matrix(X_dense) + + X_trans = QuantileNormalizer().fit_transform(X_dense) + assert_array_almost_equal(X_trans, X_dense) + + X_trans_sp = QuantileNormalizer().fit_transform(X_sparse) + assert_array_almost_equal(X_trans_sp.A, X_dense) + + assert_array_almost_equal(X_trans, X_trans_sp.A) + def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), From adc1f37b4e0f0d1718bdb64dc5e70fad95d7f1f1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 17 Feb 2017 15:00:00 +0100 Subject: [PATCH 034/106] FIX/TST bounds are provided by the fitting and not X at transform --- sklearn/preprocessing/data.py | 48 ++++++++++-------------- sklearn/preprocessing/tests/test_data.py | 13 ++++++- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 2bd78f46a36c4..4a92523dc0be8 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2118,28 +2118,25 @@ def _dense_transform(self, X, direction=True): # older version of scipy do not handle tuple as fill_value # clipping the value before transform solve the issue if direction: - np.clip(X[:, feature_idx], - self.quantiles_[0, feature_idx], - self.quantiles_[-1, feature_idx], - out=X[:, feature_idx]) + lower_bound_x = self.quantiles_[0, feature_idx] + upper_bound_x = self.quantiles_[-1, feature_idx] + lower_bound_y = references[0] + upper_bound_y = references[-1] else: - np.clip(X[:, feature_idx], references[0], - references[-1], out=X[:, feature_idx]) + lower_bound_x = references[0] + upper_bound_x = references[-1] + lower_bound_y = self.quantiles_[0, feature_idx] + upper_bound_y = self.quantiles_[-1, 
feature_idx] # Avoid computing for bounds due to numerical error of interp1d lower_bounds_idx = (X[:, feature_idx] - BOUNDS_THRESHOLD < - min(X[:, feature_idx])) + lower_bound_x) upper_bounds_idx = (X[:, feature_idx] + BOUNDS_THRESHOLD > - max(X[:, feature_idx])) + upper_bound_x) bounds_idx = np.bitwise_or(lower_bounds_idx, upper_bounds_idx) X[~bounds_idx, feature_idx] = f(X[~bounds_idx, feature_idx]) - if direction: - X[upper_bounds_idx, feature_idx] = references[-1] - X[lower_bounds_idx, feature_idx] = references[0] - else: - X[upper_bounds_idx, feature_idx] = self.quantiles_[ - -1, feature_idx] - X[lower_bounds_idx, feature_idx] = self.quantiles_[ - 0, feature_idx] + X[upper_bounds_idx, feature_idx] = upper_bound_y + X[lower_bounds_idx, feature_idx] = lower_bound_y + return X def _sparse_transform(self, X, direction=True): @@ -2173,25 +2170,20 @@ def _sparse_transform(self, X, direction=True): # older version of scipy do not handle tuple as fill_value # clipping the value before transform solve the issue if direction: - np.clip(X.data[column_slice], - self.quantiles_[0, feature_idx], - self.quantiles_[-1, feature_idx], - out=X.data[column_slice]) + upper_bound_x = self.quantiles_[-1, feature_idx] + upper_bound_y = references[-1] else: - np.clip(X.data[column_slice], references[0], - references[-1], out=X.data[column_slice]) + upper_bound_x = references[-1] + upper_bound_y = self.quantiles_[-1, feature_idx] # Avoid computing for bounds due to numerical error of interp1d # Check that there is value if X.data[column_slice].size: upper_bounds_idx = (X.data[column_slice] + BOUNDS_THRESHOLD > - max(X.data[column_slice])) + upper_bound_x) X.data[column_slice][~upper_bounds_idx] = f( X.data[column_slice][~upper_bounds_idx]) - if direction: - X.data[column_slice][upper_bounds_idx] = references[-1] - else: - X.data[column_slice][upper_bounds_idx] = self.quantiles_[ - -1, feature_idx] + X.data[column_slice][upper_bounds_idx] = upper_bound_y + return X def transform(self, X): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 75c54c4e5f241..d779a3e49169a 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -986,14 +986,23 @@ def test_qunatile_normalzer_bounds(): [1, 0]]) X_sparse = sparse.csc_matrix(X_dense) + # check sparse and dense are consistent X_trans = QuantileNormalizer().fit_transform(X_dense) assert_array_almost_equal(X_trans, X_dense) - X_trans_sp = QuantileNormalizer().fit_transform(X_sparse) assert_array_almost_equal(X_trans_sp.A, X_dense) - assert_array_almost_equal(X_trans, X_trans_sp.A) + # check the consistency of the bounds by learning on 1 matrix + # and transforming another + X = np.array([[0, 0, 1], + [1, 0.5, 0]]).T + X1 = np.array([[0, 0, 1], + [0.1, 0.5, 0.1]]).T + qn = QuantileNormalizer().fit(X) + X_trans = qn.transform(X1) + assert_array_almost_equal(X_trans, X1) + def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), From 22ea4f9e143acba81bd96290cad21bd94c8d8bab Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 17 Feb 2017 15:00:26 +0100 Subject: [PATCH 035/106] PEP8 --- sklearn/preprocessing/tests/test_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index d779a3e49169a..a76ac8f41e0da 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1003,6 +1003,7 @@ def test_qunatile_normalzer_bounds(): X_trans = 
qn.transform(X1) assert_array_almost_equal(X_trans, X1) + def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), From 81a3721e504495e22a9d529d729aeaa246feec90 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 17 Feb 2017 15:28:57 +0100 Subject: [PATCH 036/106] FIX/TST axis should be <= 1 --- sklearn/preprocessing/data.py | 5 +++- sklearn/preprocessing/tests/test_data.py | 34 ++++++++++++++++-------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 4a92523dc0be8..df861b9bf35d9 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2273,5 +2273,8 @@ def quantile_normalize(X, axis=0, n_quantiles=1000, subsample=int(1e5), random_state=random_state) if axis == 0: return n.fit_transform(X) - else: + elif axis == 1: return n.fit_transform(X.T).T + else: + raise ValueError("axis should be either equal to 0 or 1. Got" + " axis={}".format( axis)) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index a76ac8f41e0da..102c19a724882 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1004,6 +1004,21 @@ def test_qunatile_normalzer_bounds(): assert_array_almost_equal(X_trans, X1) +def test_quantile_normalizer_pickling(): + qn = QuantileNormalizer(n_quantiles=100) + + qn_ser = pickle.dumps(qn, pickle.HIGHEST_PROTOCOL) + qn2 = pickle.loads(qn_ser) + assert_false(hasattr(qn2, 'f_transform_')) + assert_false(hasattr(qn2, 'f_inverse_transform_')) + + qn.fit(iris.data) + qn_ser = pickle.dumps(qn, pickle.HIGHEST_PROTOCOL) + qn2 = pickle.loads(qn_ser) + assert_array_almost_equal(qn.transform(iris.data), + qn2.transform(iris.data)) + + def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), @@ -1796,16 +1811,13 @@ def test_fit_cold_start(): scaler.fit_transform(X_2d) -def test_quantile_normalizer_pickling(): - qn = QuantileNormalizer(n_quantiles=100) +def test_function_valid_axis(): + X = np.array([[0, 25, 50, 75, 100], + [2, 4, 6, 8, 10], + [2.6, 4.1, 2.3, 9.5, 0.1]]) - qn_ser = pickle.dumps(qn, pickle.HIGHEST_PROTOCOL) - qn2 = pickle.loads(qn_ser) - assert_false(hasattr(qn2, 'f_transform_')) - assert_false(hasattr(qn2, 'f_inverse_transform_')) + func_list = [quantile_normalize] - qn.fit(iris.data) - qn_ser = pickle.dumps(qn, pickle.HIGHEST_PROTOCOL) - qn2 = pickle.loads(qn_ser) - assert_array_almost_equal(qn.transform(iris.data), - qn2.transform(iris.data)) + for func in func_list: + assert_raises_regex(ValueError, "axis should be either equal to 0 or 1" + ". Got axis=2", func, X.T, axis=2) From 055d8aadd9ec30c8860dfae723650841fbc2f0c7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 17 Feb 2017 16:15:37 +0100 Subject: [PATCH 037/106] PEP8 --- sklearn/preprocessing/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index df861b9bf35d9..21d83ce43fb7f 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2277,4 +2277,4 @@ def quantile_normalize(X, axis=0, n_quantiles=1000, subsample=int(1e5), return n.fit_transform(X.T).T else: raise ValueError("axis should be either equal to 0 or 1. 
Got" - " axis={}".format( axis)) + " axis={}".format(axis)) From 777e35392e6007c40728e4865f5b05815ea496a1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 21 Feb 2017 17:04:58 +0100 Subject: [PATCH 038/106] ENH Add parameter ignore_implicit_zeros --- sklearn/preprocessing/data.py | 45 ++++++++++++++++++++---- sklearn/preprocessing/tests/test_data.py | 37 +++++++++++++++++-- 2 files changed, 73 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 21d83ce43fb7f..6d863b1ca79a4 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1936,6 +1936,11 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): subsample : int, optional (default=1e5) Maximum number of samples used to estimate the quantiles. + ignore_implicit_zeros : bool, optional (default=False) + Apply only for sparse matrices. If True, the sparse entries of the + matrix are discarded to compute the quantile statistics. If false, + these entries are accounting for zeros. + random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -1958,15 +1963,20 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): """ def __init__(self, n_quantiles=1000, subsample=int(1e5), - random_state=None): + ignore_implicit_zeros=False, random_state=None): self.n_quantiles = n_quantiles self.subsample = subsample + self.ignore_implicit_zeros = ignore_implicit_zeros self.random_state = random_state def _build_f(self): """Build the transform functions.""" check_is_fitted(self, 'quantiles_') + if self.ignore_implicit_zeros: + warnings.warn("'ignore_implicit_zeros' takes effect only with" + " sparse matrix. This parameter has no effect.") + references = np.linspace(0, 1, self.n_quantiles, endpoint=True) self._f_transform = tuple([ @@ -2037,15 +2047,28 @@ def _sparse_fit(self, X): # choice is not available in numpy <= 1.7 # used permutation instead. column_idx = rng.permutation(range(len(column_nnz_data))) - column_data = np.zeros(shape=self.subsample, dtype=X.dtype) + if self.ignore_implicit_zeros: + column_data = np.zeros(shape=column_subsample, + dtype=X.dtype) + else: + column_data = np.zeros(shape=self.subsample, dtype=X.dtype) column_data[:column_subsample] = column_nnz_data[ column_idx[:column_subsample]] else: - column_data = np.zeros(shape=n_samples, dtype=X.dtype) + if self.ignore_implicit_zeros: + column_data = np.zeros(shape=len(column_nnz_data), + dtype=X.dtype) + else: + column_data = np.zeros(shape=n_samples, dtype=X.dtype) column_data[:len(column_nnz_data)] = column_nnz_data - self.quantiles_.append( - np.percentile(column_data, - [x * 100 for x in references])) + if not column_data.size: + # if no nnz, an error will be raised for computing the + # quantiles. Force the quantiles to be zeros. 
+ self.quantiles_.append([0] * len(references)) + else: + self.quantiles_.append( + np.percentile(column_data, + [x * 100 for x in references])) self.quantiles_ = np.array(self.quantiles_).T def fit(self, X, y=None): @@ -2170,19 +2193,26 @@ def _sparse_transform(self, X, direction=True): # older version of scipy do not handle tuple as fill_value # clipping the value before transform solve the issue if direction: + lower_bound_x = self.quantiles_[0, feature_idx] upper_bound_x = self.quantiles_[-1, feature_idx] + lower_bound_y = references[0] upper_bound_y = references[-1] else: + lower_bound_x = references[0] upper_bound_x = references[-1] + lower_bound_y = self.quantiles_[0, feature_idx] upper_bound_y = self.quantiles_[-1, feature_idx] # Avoid computing for bounds due to numerical error of interp1d # Check that there is value if X.data[column_slice].size: + lower_bounds_idx = (X.data[column_slice] - BOUNDS_THRESHOLD < + lower_bound_x) upper_bounds_idx = (X.data[column_slice] + BOUNDS_THRESHOLD > upper_bound_x) X.data[column_slice][~upper_bounds_idx] = f( X.data[column_slice][~upper_bounds_idx]) X.data[column_slice][upper_bounds_idx] = upper_bound_y + X.data[column_slice][lower_bounds_idx] = lower_bound_y return X @@ -2268,8 +2298,9 @@ def __setstate__(self, state): def quantile_normalize(X, axis=0, n_quantiles=1000, subsample=int(1e5), - random_state=None): + ignore_implicit_zeros=False, random_state=None): n = QuantileNormalizer(n_quantiles=n_quantiles, subsample=subsample, + ignore_implicit_zeros=ignore_implicit_zeros, random_state=random_state) if axis == 0: return n.fit_transform(X) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 102c19a724882..75fc4ff338414 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -857,8 +857,6 @@ def test_quantile_normalizer_iris(): X = iris.data normalizer = QuantileNormalizer() X_trans = normalizer.fit_transform(X) - # FIXME: one of those will drive to precision error - # in the interpolation assert_array_almost_equal(np.min(X_trans, axis=0), 0.) assert_array_almost_equal(np.max(X_trans, axis=0), 1.) X_trans_inv = normalizer.inverse_transform(X_trans) @@ -901,6 +899,41 @@ def test_quantile_normalizer_check_error(): normalizer.inverse_transform, X_bad_feat) +def test_quantile_normalizer_ignore_zeros(): + X = np.array([[0, 0, 0, 0, 0], + [1, 0, 2, 2, 1]]).T + X_sparse = sparse.csc_matrix(X) + nq = QuantileNormalizer(ignore_implicit_zeros=True, n_quantiles=5) + + # dense case -> warning raise + assert_warns_message(UserWarning, "'ignore_implicit_zeros' takes effect" + " only with sparse matrix. 
This parameter has no" + " effect.", nq.fit, X) + + X_gt = np.array([[0, 0, 0, 0, 0], + [0, 0, 1, 1, 0]]).T + X_trans = nq.fit_transform(X_sparse) + assert_almost_equal(X_gt, X_trans.A) + + # consider the case where sparse entries are missing values and user-given + # zeros are to be considered + X_data = np.array([0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 0]) + X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8]) + X_sparse = sparse.csc_matrix((X_data, (X_row, X_col))) + X_trans = nq.fit_transform(X_sparse) + X_gt = np.array([[0., 0.5], + [0., 0.], + [0., 1.], + [0., 1.], + [0., 0.5], + [0., 0.], + [0., 0.5], + [0., 1.], + [0., 0.]]) + assert_almost_equal(X_gt, X_trans.A) + + def test_quantile_normalizer_dense_toy(): X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], From 63708c2815cec414984c8a883dd79bd1f4503f0d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 21 Feb 2017 18:11:04 +0100 Subject: [PATCH 039/106] ENH match output distribution --- sklearn/preprocessing/data.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 6d863b1ca79a4..ae8cdf435fbd2 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -16,6 +16,7 @@ import numpy as np from scipy import sparse from scipy.interpolate import interp1d +from scipy import stats from ..base import BaseEstimator, TransformerMixin from ..externals import six @@ -1941,6 +1942,10 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): matrix are discarded to compute the quantile statistics. If false, these entries are accounting for zeros. + output_pdf : scipy.stats.rv_continuous, optional (default=uniform) + Probability density function of the normalized data. It should be a + subclass of ``scipy.stats.rv_continuous``. 
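In rough terms, the forward transform first maps each feature onto [0, 1] and then pushes the result through the ppf of the requested distribution, while the inverse goes back through its cdf. A minimal scipy illustration of that round trip, not part of the patch:

import numpy as np
from scipy import stats

u = np.array([0.1, 0.5, 0.9])      # values already mapped to the unit interval
x = stats.norm.ppf(u)              # forward: uniform ranks -> normal scores
u_back = stats.norm.cdf(x)         # inverse: normal scores -> uniform ranks
assert np.allclose(u, u_back)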
+ random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -1963,10 +1968,12 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): """ def __init__(self, n_quantiles=1000, subsample=int(1e5), - ignore_implicit_zeros=False, random_state=None): + ignore_implicit_zeros=False, output_pdf=stats.uniform, + random_state=None): self.n_quantiles = n_quantiles self.subsample = subsample self.ignore_implicit_zeros = ignore_implicit_zeros + self.output_pdf = output_pdf self.random_state = random_state def _build_f(self): @@ -2150,6 +2157,11 @@ def _dense_transform(self, X, direction=True): upper_bound_x = references[-1] lower_bound_y = self.quantiles_[0, feature_idx] upper_bound_y = self.quantiles_[-1, feature_idx] + if not direction: + # for inverse transform, match a uniform PDF + for i in range(X.shape[0]): + X[i, feature_idx] = self.output_pdf.cdf( + X[i, feature_idx]) # Avoid computing for bounds due to numerical error of interp1d lower_bounds_idx = (X[:, feature_idx] - BOUNDS_THRESHOLD < lower_bound_x) @@ -2159,6 +2171,11 @@ def _dense_transform(self, X, direction=True): X[~bounds_idx, feature_idx] = f(X[~bounds_idx, feature_idx]) X[upper_bounds_idx, feature_idx] = upper_bound_y X[lower_bounds_idx, feature_idx] = lower_bound_y + # for forward transform, match the output PDF + if direction: + for i in range(X.shape[0]): + X[i, feature_idx] = self.output_pdf.ppf( + X[i, feature_idx]) return X @@ -2202,6 +2219,12 @@ def _sparse_transform(self, X, direction=True): upper_bound_x = references[-1] lower_bound_y = self.quantiles_[0, feature_idx] upper_bound_y = self.quantiles_[-1, feature_idx] + # for inverse transform, match a uniform PDF + if not direction: + for i in range(X.data[column_slice].size): + X.data[column_slice][i] = self.output_pdf.cdf( + X.data[column_slice][i]) + # Avoid computing for bounds due to numerical error of interp1d # Check that there is value if X.data[column_slice].size: @@ -2213,6 +2236,11 @@ def _sparse_transform(self, X, direction=True): X.data[column_slice][~upper_bounds_idx]) X.data[column_slice][upper_bounds_idx] = upper_bound_y X.data[column_slice][lower_bounds_idx] = lower_bound_y + # for forward transform, match the output PDF + if direction: + for i in range(X.data[column_slice].size): + X.data[column_slice][i] = self.output_pdf.ppf( + X.data[column_slice][i]) return X @@ -2244,6 +2272,11 @@ def transform(self, X): raise ValueError('X does not have the same number of feature than' ' the previously fitted data. Got {} instead of' ' {}'.format(X.shape[1], len(self._f_transform))) + # check the output object + if not issubclass(type(self.output_pdf), stats.rv_continuous): + raise ValueError('output_pdf has to be a subclass of ' + 'scipy.stats.rv_continuous. 
Got {} ' + ' instead'.format(type(self.output_pdf))) if sparse.issparse(X): return self._sparse_transform(X, True) else: From 6e6eb5249a5b6d8687550dc5cdeedef8f3971972 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 21 Feb 2017 19:03:43 +0100 Subject: [PATCH 040/106] ENH clip the data to avoid infinity due to output PDF --- sklearn/preprocessing/data.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index ae8cdf435fbd2..42da958b17e74 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2176,7 +2176,13 @@ def _dense_transform(self, X, direction=True): for i in range(X.shape[0]): X[i, feature_idx] = self.output_pdf.ppf( X[i, feature_idx]) - + # find the value to clip the data to avoid mapping to + # infinity. Clip such that the inverse transform will be + # consistent + clip_min = self.output_pdf.ppf(BOUNDS_THRESHOLD / 10) + clip_max = self.output_pdf.ppf(1 - (BOUNDS_THRESHOLD / 10)) + X[:, feature_idx] = np.clip(X[:, feature_idx], clip_min, + clip_max) return X def _sparse_transform(self, X, direction=True): @@ -2238,10 +2244,18 @@ def _sparse_transform(self, X, direction=True): X.data[column_slice][lower_bounds_idx] = lower_bound_y # for forward transform, match the output PDF if direction: + # find the value to clip the data to avoid mapping to + # infinity. Clip such that the inverse transform will be + # consistent. + clip_min = self.output_pdf.ppf(BOUNDS_THRESHOLD / 10) + clip_max = self.output_pdf.ppf(1 - (BOUNDS_THRESHOLD / 10)) for i in range(X.data[column_slice].size): X.data[column_slice][i] = self.output_pdf.ppf( X.data[column_slice][i]) - + if X.data[column_slice][i] > clip_max: + X.data[column_slice][i] = clip_max + elif X.data[column_slice][i] < clip_min: + X.data[column_slice][i] = clip_min return X def transform(self, X): From 1aba0fefb15ad7872a728e6aa640514d7020aa26 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 22 Feb 2017 13:33:36 +0100 Subject: [PATCH 041/106] FIX ENH restraint to uniform and norm --- sklearn/preprocessing/data.py | 42 +++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 42da958b17e74..899bff50c3b71 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1942,9 +1942,9 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): matrix are discarded to compute the quantile statistics. If false, these entries are accounting for zeros. - output_pdf : scipy.stats.rv_continuous, optional (default=uniform) - Probability density function of the normalized data. It should be a - subclass of ``scipy.stats.rv_continuous``. + output_pdf : str, optional (default='norm') + Probability density function of the normalized data. The choices are + 'norm' (default) or 'uniform'. 
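The clipping added in the previous commit guards against the ppf of the target distribution returning infinities at exactly 0 and 1. A small illustration using the same BOUNDS_THRESHOLD / 10 margin; the snippet is a sketch, not code from the patch:

import numpy as np
from scipy import stats

BOUNDS_THRESHOLD = 1e-7
print(stats.norm.ppf([0.0, 1.0]))                  # [-inf  inf] must be avoided
clip_min = stats.norm.ppf(BOUNDS_THRESHOLD / 10)
clip_max = stats.norm.ppf(1 - BOUNDS_THRESHOLD / 10)
x = np.clip(stats.norm.ppf([0.0, 0.5, 1.0]), clip_min, clip_max)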
random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -1968,7 +1968,7 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): """ def __init__(self, n_quantiles=1000, subsample=int(1e5), - ignore_implicit_zeros=False, output_pdf=stats.uniform, + ignore_implicit_zeros=False, output_pdf='uniform', random_state=None): self.n_quantiles = n_quantiles self.subsample = subsample @@ -2141,6 +2141,7 @@ def _dense_transform(self, X, direction=True): func_transform = self._f_transform else: func_transform = self._f_inverse_transform + class_pdf = getattr(stats, self.output_pdf) references = np.linspace(0, 1, self.n_quantiles, endpoint=True) @@ -2160,7 +2161,7 @@ def _dense_transform(self, X, direction=True): if not direction: # for inverse transform, match a uniform PDF for i in range(X.shape[0]): - X[i, feature_idx] = self.output_pdf.cdf( + X[i, feature_idx] = class_pdf.cdf( X[i, feature_idx]) # Avoid computing for bounds due to numerical error of interp1d lower_bounds_idx = (X[:, feature_idx] - BOUNDS_THRESHOLD < @@ -2174,13 +2175,13 @@ def _dense_transform(self, X, direction=True): # for forward transform, match the output PDF if direction: for i in range(X.shape[0]): - X[i, feature_idx] = self.output_pdf.ppf( + X[i, feature_idx] = class_pdf.ppf( X[i, feature_idx]) # find the value to clip the data to avoid mapping to # infinity. Clip such that the inverse transform will be # consistent - clip_min = self.output_pdf.ppf(BOUNDS_THRESHOLD / 10) - clip_max = self.output_pdf.ppf(1 - (BOUNDS_THRESHOLD / 10)) + clip_min = class_pdf.ppf(BOUNDS_THRESHOLD / 10) + clip_max = class_pdf.ppf(1 - (BOUNDS_THRESHOLD / 10)) X[:, feature_idx] = np.clip(X[:, feature_idx], clip_min, clip_max) return X @@ -2207,6 +2208,7 @@ def _sparse_transform(self, X, direction=True): func_transform = self._f_transform else: func_transform = self._f_inverse_transform + class_pdf = getattr(stats, self.output_pdf) references = np.linspace(0, 1, self.n_quantiles, endpoint=True) @@ -2228,7 +2230,7 @@ def _sparse_transform(self, X, direction=True): # for inverse transform, match a uniform PDF if not direction: for i in range(X.data[column_slice].size): - X.data[column_slice][i] = self.output_pdf.cdf( + X.data[column_slice][i] = class_pdf.cdf( X.data[column_slice][i]) # Avoid computing for bounds due to numerical error of interp1d @@ -2247,10 +2249,10 @@ def _sparse_transform(self, X, direction=True): # find the value to clip the data to avoid mapping to # infinity. Clip such that the inverse transform will be # consistent. - clip_min = self.output_pdf.ppf(BOUNDS_THRESHOLD / 10) - clip_max = self.output_pdf.ppf(1 - (BOUNDS_THRESHOLD / 10)) + clip_min = class_pdf.ppf(BOUNDS_THRESHOLD / 10) + clip_max = class_pdf.ppf(1 - (BOUNDS_THRESHOLD / 10)) for i in range(X.data[column_slice].size): - X.data[column_slice][i] = self.output_pdf.ppf( + X.data[column_slice][i] = class_pdf.ppf( X.data[column_slice][i]) if X.data[column_slice][i] > clip_max: X.data[column_slice][i] = clip_max @@ -2286,11 +2288,12 @@ def transform(self, X): raise ValueError('X does not have the same number of feature than' ' the previously fitted data. Got {} instead of' ' {}'.format(X.shape[1], len(self._f_transform))) - # check the output object - if not issubclass(type(self.output_pdf), stats.rv_continuous): - raise ValueError('output_pdf has to be a subclass of ' - 'scipy.stats.rv_continuous. 
Got {} ' - ' instead'.format(type(self.output_pdf))) + # check the output PDF + if self.output_pdf not in ('norm', 'uniform'): + raise ValueError("'output_pdf' has to be either 'norm' or" + " 'uniform'. Got {} instead.".format( + self.output_pdf)) + if sparse.issparse(X): return self._sparse_transform(X, True) else: @@ -2322,6 +2325,11 @@ def inverse_transform(self, X): ' the previously fitted data. Got {} instead of' ' {}'.format(X.shape[1], len(self._f_inverse_transform))) + # check the output PDF + if self.output_pdf not in ('norm', 'uniform'): + raise ValueError("'output_pdf' has to be either 'norm' or" + " 'uniform'. Got {} instead.".format( + self.output_pdf)) if sparse.issparse(X): return self._sparse_transform(X, False) else: From d1a94f582af9dce63f4c79c1e6e53cecfa8329f8 Mon Sep 17 00:00:00 2001 From: "(Venkat) Raghav (Rajagopalan)" Date: Wed, 22 Feb 2017 13:53:19 +0100 Subject: [PATCH 042/106] [MRG] ENH Add example comparing the distribution of all scaling preprocessor (#2) * ENH Add example comparing the distribution of all scaling preprocessor * Remove Jupyter notebook convert * FIX/ENH Select feat before not after; Plot interquantile data range for all * Add heatmap legend * Remove comment maybe? * Move doc from robust_scaling to plot_all_scaling; Need to update doc * Update the doc * Better aesthetics; Better spacing and plot colormap only at end * Shameless author re-ordering ;P * Use env python for she-bang --- examples/preprocessing/plot_all_scaling.py | 136 ++++++++++++++++++ examples/preprocessing/plot_robust_scaling.py | 84 ----------- 2 files changed, 136 insertions(+), 84 deletions(-) create mode 100644 examples/preprocessing/plot_all_scaling.py delete mode 100644 examples/preprocessing/plot_robust_scaling.py diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py new file mode 100644 index 0000000000000..80e67570af0c8 --- /dev/null +++ b/examples/preprocessing/plot_all_scaling.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +============================================================= +Compare the effect of different scalers on data with outliers +============================================================= + +The feature 0 and feature 5 of california housing dataset contains large +outliers that can make visualization of the data difficult. + +Also linear models like :class:`sklearn.linear_model.SVM` require data which is +approximately normalized to the [-1, 1] or [0, 1] range, or at the very least +have all the features on the same scale. + +This example uses different scalers and normalizers to bring the data within a +smaller range. +""" +from __future__ import print_function +print(__doc__) + +# Author: Raghav RV +# Thomas Unterthiner +# License: BSD 3 clause + +import numpy as np + +import matplotlib as mpl +from matplotlib import pyplot as plt +from matplotlib import cm, gridspec + +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import minmax_scale +from sklearn.preprocessing import MaxAbsScaler +from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import RobustScaler +from sklearn.preprocessing import Normalizer +from sklearn.preprocessing.data import QuantileNormalizer + +from sklearn.datasets import fetch_california_housing + +dataset = fetch_california_housing() +X_full, y_full = dataset.data, dataset.target + +# Take only 2 features to make visualization easier +# Feature of 0 has a long tail distribution. 
+# Feature 5 has a few but very large outliers. + +X = X_full[:, [0, 5]] + +X_min_max_scaled = MinMaxScaler().fit_transform(X) +X_max_abs_scaled = MaxAbsScaler().fit_transform(X) +X_standard_scaled = StandardScaler().fit_transform(X) +X_robust_scaled = RobustScaler(quantile_range=(25, 75)).fit_transform(X) +X_l2_normalized = Normalizer().fit_transform(X) +X_quantile_normalized = QuantileNormalizer().fit_transform(X) + +y = minmax_scale(y_full) # To make colors corresponding to the target + + +def plot_distribution(axes, X, y, hist_nbins=50, plot_title="", size=(15, 10), + X_label="", y_label=""): + ax, hist_X1, hist_X0, empty = axes + empty.axis('off') + + ax.set_title(plot_title, fontsize=12) + ax.set_xlabel(X_label) + ax.set_ylabel(y_label) + + # The scatter plot + colors = cm.plasma_r(y) + ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker='o', s=5, lw=0, c=colors) + + # Removing the top and the right spine for aesthetics + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + + # Histogram for axis X1 (feature 5) + hist_X1.set_ylim(ax.get_ylim()) + hist_X1.hist(X[:, 1], bins=hist_nbins, orientation='horizontal', + color='grey', ec='grey') + hist_X1.axis('off') + + # Histogram for axis X0 (feature 0) + hist_X0.set_xlim(ax.get_xlim()) + hist_X0.invert_yaxis() + hist_X0.hist(X[:, 0], bins=hist_nbins, orientation='vertical', + color='grey', ec='grey') + hist_X0.axis('off') + +fig = plt.figure(figsize=(15, 50)) +gs = gridspec.GridSpec(15, 5, + width_ratios=[5, 1, 0.1, 5, 1], wspace=0.3, + height_ratios=[5, 1] * 7 + [0.4], hspace=0.4) +subplots = list(plt.subplot(g) for g in gs) + +for i, (X, title) in enumerate(( + (X, "Unscaled data"), + (X_min_max_scaled, "Data after min-max scaling"), + (X_robust_scaled, "Data after robust scaling"), + (X_max_abs_scaled, "Data after max-abs scaling"), + (X_standard_scaled, "Data after standard scaling"), + (X_l2_normalized, "Data after sample-wise L2 normalizing"), + (X_quantile_normalized, "Data after quantile normalizing"))): + offset = 10 * i + + # Distribution with all outliers + axes = subplots[offset:offset + 2] + subplots[offset + 5:offset + 7] + plot_distribution(axes, X, y, hist_nbins=50, + plot_title=title + " including outliers\n", + X_label="Median Income", y_label="Number of households") + + # Some blank vertical space between two plots so they don't overlap + subplots[offset + 2].axis('off') + subplots[offset + 7].axis('off') + + # Distribution with extreme outliers removed + X0_min, X0_99th_pc = np.percentile(X[:, 0], [0, 99]) + X1_min, X1_99th_pc = np.percentile(X[:, 1], [0, 99]) + + non_outliers = np.all(X < [X0_99th_pc, X1_99th_pc], axis=1) + axes = subplots[offset + 3:offset + 5] + subplots[offset + 8:offset + 10] + plot_distribution(axes, X[non_outliers], y[non_outliers], hist_nbins=50, + plot_title=(title + + "\nZoomed-in at percentile range [0, 99)"), + X_label="Median Income", y_label="Number of households") + +# Plot a heatmap legend for the y, combining a row of 4 cols +heatmap_legend_ax = plt.subplot(gs[-5:]) +norm = mpl.colors.Normalize(y_full.min(), y_full.max()) +mpl.colorbar.ColorbarBase(heatmap_legend_ax, cmap=cm.plasma_r, + norm=norm, orientation='horizontal', + label='Color mapping for values of y') +plt.show() diff --git a/examples/preprocessing/plot_robust_scaling.py b/examples/preprocessing/plot_robust_scaling.py deleted file mode 100644 index e752284147b4d..0000000000000 --- a/examples/preprocessing/plot_robust_scaling.py +++ /dev/null @@ 
-1,84 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -========================================================= -Robust Scaling on Toy Data -========================================================= - -Making sure that each Feature has approximately the same scale can be a -crucial preprocessing step. However, when data contains outliers, -:class:`StandardScaler ` can often -be mislead. In such cases, it is better to use a scaler that is robust -against outliers. - -Here, we demonstrate this on a toy dataset, where one single datapoint -is a large outlier. -""" -from __future__ import print_function -print(__doc__) - - -# Code source: Thomas Unterthiner -# License: BSD 3 clause - -import matplotlib.pyplot as plt -import numpy as np -from sklearn.preprocessing import StandardScaler, RobustScaler - -# Create training and test data -np.random.seed(42) -n_datapoints = 100 -Cov = [[0.9, 0.0], [0.0, 20.0]] -mu1 = [100.0, -3.0] -mu2 = [101.0, -3.0] -X1 = np.random.multivariate_normal(mean=mu1, cov=Cov, size=n_datapoints) -X2 = np.random.multivariate_normal(mean=mu2, cov=Cov, size=n_datapoints) -Y_train = np.hstack([[-1]*n_datapoints, [1]*n_datapoints]) -X_train = np.vstack([X1, X2]) - -X1 = np.random.multivariate_normal(mean=mu1, cov=Cov, size=n_datapoints) -X2 = np.random.multivariate_normal(mean=mu2, cov=Cov, size=n_datapoints) -Y_test = np.hstack([[-1]*n_datapoints, [1]*n_datapoints]) -X_test = np.vstack([X1, X2]) - -X_train[0, 0] = -1000 # a fairly large outlier - - -# Scale data -standard_scaler = StandardScaler() -Xtr_s = standard_scaler.fit_transform(X_train) -Xte_s = standard_scaler.transform(X_test) - -robust_scaler = RobustScaler() -Xtr_r = robust_scaler.fit_transform(X_train) -Xte_r = robust_scaler.transform(X_test) - - -# Plot data -fig, ax = plt.subplots(1, 3, figsize=(12, 4)) -ax[0].scatter(X_train[:, 0], X_train[:, 1], - color=np.where(Y_train > 0, 'r', 'b')) -ax[1].scatter(Xtr_s[:, 0], Xtr_s[:, 1], color=np.where(Y_train > 0, 'r', 'b')) -ax[2].scatter(Xtr_r[:, 0], Xtr_r[:, 1], color=np.where(Y_train > 0, 'r', 'b')) -ax[0].set_title("Unscaled data") -ax[1].set_title("After standard scaling (zoomed in)") -ax[2].set_title("After robust scaling (zoomed in)") -# for the scaled data, we zoom in to the data center (outlier can't be seen!) -for a in ax[1:]: - a.set_xlim(-3, 3) - a.set_ylim(-3, 3) -plt.tight_layout() -plt.show() - - -# Classify using k-NN -from sklearn.neighbors import KNeighborsClassifier - -knn = KNeighborsClassifier() -knn.fit(Xtr_s, Y_train) -acc_s = knn.score(Xte_s, Y_test) -print("Testset accuracy using standard scaler: %.3f" % acc_s) -knn.fit(Xtr_r, Y_train) -acc_r = knn.score(Xte_r, Y_test) -print("Testset accuracy using robust scaler: %.3f" % acc_r) From f1282f2ae622700257ffc98890798ce9c759d139 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 22 Feb 2017 18:08:15 +0100 Subject: [PATCH 043/106] TST Validity of output_pdf --- sklearn/preprocessing/tests/test_data.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 75fc4ff338414..15cae06bc15b1 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -898,6 +898,14 @@ def test_quantile_normalizer_check_error(): "feature than the previously fitted data.", normalizer.inverse_transform, X_bad_feat) + assert_raises_regex(ValueError, "'output_pdf' has to be either 'norm' or" + " 'uniform'. 
Got rnd instead.", + QuantileNormalizer(output_pdf='rnd').fit_transform, X) + assert_raises_regex(ValueError, "'output_pdf' has to be either 'norm' or" + " 'uniform'. Got rnd instead.", + QuantileNormalizer(output_pdf='rnd').fit( + X).inverse_transform, X) + def test_quantile_normalizer_ignore_zeros(): X = np.array([[0, 0, 0, 0, 0], From 11709a3ec10f58df57a16d1ed57184da37d64ddb Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Wed, 22 Feb 2017 18:42:49 +0100 Subject: [PATCH 044/106] EXA Use OrderedDict; Make it easier to add more transformations --- examples/preprocessing/plot_all_scaling.py | 65 +++++++++++++--------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 80e67570af0c8..708a688980ab9 100644 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -23,7 +23,10 @@ # Thomas Unterthiner # License: BSD 3 clause +from collections import OrderedDict + import numpy as np +import scipy import matplotlib as mpl from matplotlib import pyplot as plt @@ -48,14 +51,26 @@ X = X_full[:, [0, 5]] -X_min_max_scaled = MinMaxScaler().fit_transform(X) -X_max_abs_scaled = MaxAbsScaler().fit_transform(X) -X_standard_scaled = StandardScaler().fit_transform(X) -X_robust_scaled = RobustScaler(quantile_range=(25, 75)).fit_transform(X) -X_l2_normalized = Normalizer().fit_transform(X) -X_quantile_normalized = QuantileNormalizer().fit_transform(X) - -y = minmax_scale(y_full) # To make colors corresponding to the target +distributions = OrderedDict(( + ('Unscaled data', X), + ('Data after min-max scaling', + MinMaxScaler().fit_transform(X)), + ('Data after robust scaling', + RobustScaler(quantile_range=(25, 75)).fit_transform(X)), + ('Data after max-abs scaling', + MaxAbsScaler().fit_transform(X)), + ('Data after standard scaling', + StandardScaler().fit_transform(X)), + ('Data after sample-wise L2 normalizing', + Normalizer().fit_transform(X)), + ('Data after quantile normalizing (uniform pdf)', + QuantileNormalizer(output_pdf=scipy.stats.uniform) + .fit_transform(X)), + ('Data after quantile normalizing (gaussian pdf)', + QuantileNormalizer(output_pdf=scipy.stats.norm) + .fit_transform(X)))) + +y = minmax_scale(y_full) # To make colors corresponding to the targe), def plot_distribution(axes, X, y, hist_nbins=50, plot_title="", size=(15, 10), @@ -90,25 +105,19 @@ def plot_distribution(axes, X, y, hist_nbins=50, plot_title="", size=(15, 10), color='grey', ec='grey') hist_X0.axis('off') -fig = plt.figure(figsize=(15, 50)) -gs = gridspec.GridSpec(15, 5, +n_dist = len(distributions) +fig = plt.figure(figsize=(15, n_dist * 8 + 1)) +gs = gridspec.GridSpec(n_dist * 2 + 1, 5, width_ratios=[5, 1, 0.1, 5, 1], wspace=0.3, - height_ratios=[5, 1] * 7 + [0.4], hspace=0.4) + height_ratios=[5, 1] * n_dist + [0.4], + hspace=0.4) subplots = list(plt.subplot(g) for g in gs) -for i, (X, title) in enumerate(( - (X, "Unscaled data"), - (X_min_max_scaled, "Data after min-max scaling"), - (X_robust_scaled, "Data after robust scaling"), - (X_max_abs_scaled, "Data after max-abs scaling"), - (X_standard_scaled, "Data after standard scaling"), - (X_l2_normalized, "Data after sample-wise L2 normalizing"), - (X_quantile_normalized, "Data after quantile normalizing"))): +for i, (title, X) in enumerate(distributions.items()): offset = 10 * i - # Distribution with all outliers axes = subplots[offset:offset + 2] + subplots[offset + 5:offset + 7] - plot_distribution(axes, X, y, hist_nbins=50, + 
plot_distribution(axes, X, y, hist_nbins=200, plot_title=title + " including outliers\n", X_label="Median Income", y_label="Number of households") @@ -116,15 +125,19 @@ def plot_distribution(axes, X, y, hist_nbins=50, plot_title="", size=(15, 10), subplots[offset + 2].axis('off') subplots[offset + 7].axis('off') + zoom_in_percentile_range = (0, 99) # Distribution with extreme outliers removed - X0_min, X0_99th_pc = np.percentile(X[:, 0], [0, 99]) - X1_min, X1_99th_pc = np.percentile(X[:, 1], [0, 99]) + cutoffs_X0 = np.percentile(X[:, 0], zoom_in_percentile_range) + cutoffs_X1 = np.percentile(X[:, 1], zoom_in_percentile_range) - non_outliers = np.all(X < [X0_99th_pc, X1_99th_pc], axis=1) + non_outliers_mask = ( + np.all(X > [cutoffs_X0[0], cutoffs_X1[0]], axis=1) & + np.all(X < [cutoffs_X0[1], cutoffs_X1[1]], axis=1)) axes = subplots[offset + 3:offset + 5] + subplots[offset + 8:offset + 10] - plot_distribution(axes, X[non_outliers], y[non_outliers], hist_nbins=50, + plot_distribution(axes, X[non_outliers_mask], y[non_outliers_mask], hist_nbins=50, plot_title=(title + - "\nZoomed-in at percentile range [0, 99)"), + "\nZoomed-in at percentile range %s" + % str(zoom_in_percentile_range)), X_label="Median Income", y_label="Number of households") # Plot a heatmap legend for the y, combining a row of 4 cols From cf5fa8d7d32dd80d284e5d7508953df699b9774e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 23 Feb 2017 17:09:15 +0100 Subject: [PATCH 045/106] FIX PEP8 and replace scipy.stats by str in example --- examples/preprocessing/plot_all_scaling.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) mode change 100644 => 100755 examples/preprocessing/plot_all_scaling.py diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py old mode 100644 new mode 100755 index 708a688980ab9..adf767a2b8bbe --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -64,13 +64,13 @@ ('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X)), ('Data after quantile normalizing (uniform pdf)', - QuantileNormalizer(output_pdf=scipy.stats.uniform) + QuantileNormalizer(output_pdf='uniform') .fit_transform(X)), ('Data after quantile normalizing (gaussian pdf)', - QuantileNormalizer(output_pdf=scipy.stats.norm) + QuantileNormalizer(output_pdf='norm') .fit_transform(X)))) -y = minmax_scale(y_full) # To make colors corresponding to the targe), +y = minmax_scale(y_full) # To make colors corresponding to the target), def plot_distribution(axes, X, y, hist_nbins=50, plot_title="", size=(15, 10), @@ -134,7 +134,8 @@ def plot_distribution(axes, X, y, hist_nbins=50, plot_title="", size=(15, 10), np.all(X > [cutoffs_X0[0], cutoffs_X1[0]], axis=1) & np.all(X < [cutoffs_X0[1], cutoffs_X1[1]], axis=1)) axes = subplots[offset + 3:offset + 5] + subplots[offset + 8:offset + 10] - plot_distribution(axes, X[non_outliers_mask], y[non_outliers_mask], hist_nbins=50, + plot_distribution(axes, X[non_outliers_mask], y[non_outliers_mask], + hist_nbins=50, plot_title=(title + "\nZoomed-in at percentile range %s" % str(zoom_in_percentile_range)), From 0150f6297d945258164e597f75f287ca4b207bb6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 23 Feb 2017 18:21:44 +0100 Subject: [PATCH 046/106] FIX remove useless import --- examples/preprocessing/plot_all_scaling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 
adf767a2b8bbe..e634ac7703490 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -26,7 +26,6 @@ from collections import OrderedDict import numpy as np -import scipy import matplotlib as mpl from matplotlib import pyplot as plt From 81c08cc9a806239e6b980f4a1259704d7b1aaaac Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 27 Feb 2017 21:47:47 +0100 Subject: [PATCH 047/106] COSMET change variable names --- sklearn/preprocessing/data.py | 43 ++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 899bff50c3b71..9fb3e4bc95feb 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1942,9 +1942,9 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): matrix are discarded to compute the quantile statistics. If false, these entries are accounting for zeros. - output_pdf : str, optional (default='norm') - Probability density function of the normalized data. The choices are - 'norm' (default) or 'uniform'. + output_distribution : str, optional (default='uniform') + Marginal distribution for the transformed data. The choices are + 'uniform' (default) or 'norm'. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -1968,12 +1968,12 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): """ def __init__(self, n_quantiles=1000, subsample=int(1e5), - ignore_implicit_zeros=False, output_pdf='uniform', + ignore_implicit_zeros=False, output_distribution='uniform', random_state=None): self.n_quantiles = n_quantiles self.subsample = subsample self.ignore_implicit_zeros = ignore_implicit_zeros - self.output_pdf = output_pdf + self.output_distribution = output_distribution self.random_state = random_state def _build_f(self): @@ -2141,7 +2141,7 @@ def _dense_transform(self, X, direction=True): func_transform = self._f_transform else: func_transform = self._f_inverse_transform - class_pdf = getattr(stats, self.output_pdf) + output_distribution = getattr(stats, self.output_distribution) references = np.linspace(0, 1, self.n_quantiles, endpoint=True) @@ -2161,7 +2161,7 @@ def _dense_transform(self, X, direction=True): if not direction: # for inverse transform, match a uniform PDF for i in range(X.shape[0]): - X[i, feature_idx] = class_pdf.cdf( + X[i, feature_idx] = output_distribution.cdf( X[i, feature_idx]) # Avoid computing for bounds due to numerical error of interp1d lower_bounds_idx = (X[:, feature_idx] - BOUNDS_THRESHOLD < @@ -2175,13 +2175,13 @@ def _dense_transform(self, X, direction=True): # for forward transform, match the output PDF if direction: for i in range(X.shape[0]): - X[i, feature_idx] = class_pdf.ppf( + X[i, feature_idx] = output_distribution.ppf( X[i, feature_idx]) # find the value to clip the data to avoid mapping to # infinity. 
Clip such that the inverse transform will be # consistent - clip_min = class_pdf.ppf(BOUNDS_THRESHOLD / 10) - clip_max = class_pdf.ppf(1 - (BOUNDS_THRESHOLD / 10)) + clip_min = output_distribution.ppf(BOUNDS_THRESHOLD / 10) + clip_max = output_distribution.ppf(1 - (BOUNDS_THRESHOLD / 10)) X[:, feature_idx] = np.clip(X[:, feature_idx], clip_min, clip_max) return X @@ -2208,7 +2208,7 @@ def _sparse_transform(self, X, direction=True): func_transform = self._f_transform else: func_transform = self._f_inverse_transform - class_pdf = getattr(stats, self.output_pdf) + output_distribution = getattr(stats, self.output_distribution) references = np.linspace(0, 1, self.n_quantiles, endpoint=True) @@ -2230,7 +2230,7 @@ def _sparse_transform(self, X, direction=True): # for inverse transform, match a uniform PDF if not direction: for i in range(X.data[column_slice].size): - X.data[column_slice][i] = class_pdf.cdf( + X.data[column_slice][i] = output_distribution.cdf( X.data[column_slice][i]) # Avoid computing for bounds due to numerical error of interp1d @@ -2249,10 +2249,11 @@ def _sparse_transform(self, X, direction=True): # find the value to clip the data to avoid mapping to # infinity. Clip such that the inverse transform will be # consistent. - clip_min = class_pdf.ppf(BOUNDS_THRESHOLD / 10) - clip_max = class_pdf.ppf(1 - (BOUNDS_THRESHOLD / 10)) + clip_min = output_distribution.ppf(BOUNDS_THRESHOLD / 10) + clip_max = output_distribution.ppf(1 - ( + BOUNDS_THRESHOLD / 10)) for i in range(X.data[column_slice].size): - X.data[column_slice][i] = class_pdf.ppf( + X.data[column_slice][i] = output_distribution.ppf( X.data[column_slice][i]) if X.data[column_slice][i] > clip_max: X.data[column_slice][i] = clip_max @@ -2289,10 +2290,10 @@ def transform(self, X): ' the previously fitted data. Got {} instead of' ' {}'.format(X.shape[1], len(self._f_transform))) # check the output PDF - if self.output_pdf not in ('norm', 'uniform'): - raise ValueError("'output_pdf' has to be either 'norm' or" + if self.output_distribution not in ('norm', 'uniform'): + raise ValueError("'output_distribution' has to be either 'norm' or" " 'uniform'. Got {} instead.".format( - self.output_pdf)) + self.output_distribution)) if sparse.issparse(X): return self._sparse_transform(X, True) @@ -2326,10 +2327,10 @@ def inverse_transform(self, X): ' {}'.format(X.shape[1], len(self._f_inverse_transform))) # check the output PDF - if self.output_pdf not in ('norm', 'uniform'): - raise ValueError("'output_pdf' has to be either 'norm' or" + if self.output_distribution not in ('norm', 'uniform'): + raise ValueError("'output_distribution' has to be either 'norm' or" " 'uniform'. 
Got {} instead.".format( - self.output_pdf)) + self.output_distribution)) if sparse.issparse(X): return self._sparse_transform(X, False) else: From adde8cf7309f2d5848e880ecde9a4682572447d0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 27 Feb 2017 23:21:29 +0100 Subject: [PATCH 048/106] FIX change output_pdf occurence to output_distribution --- examples/preprocessing/plot_all_scaling.py | 4 ++-- sklearn/preprocessing/tests/test_data.py | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index e634ac7703490..f1603d4706461 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -63,10 +63,10 @@ ('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X)), ('Data after quantile normalizing (uniform pdf)', - QuantileNormalizer(output_pdf='uniform') + QuantileNormalizer(output_distribution='uniform') .fit_transform(X)), ('Data after quantile normalizing (gaussian pdf)', - QuantileNormalizer(output_pdf='norm') + QuantileNormalizer(output_distribution='norm') .fit_transform(X)))) y = minmax_scale(y_full) # To make colors corresponding to the target), diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 15cae06bc15b1..0d47657773892 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -898,12 +898,13 @@ def test_quantile_normalizer_check_error(): "feature than the previously fitted data.", normalizer.inverse_transform, X_bad_feat) - assert_raises_regex(ValueError, "'output_pdf' has to be either 'norm' or" - " 'uniform'. Got rnd instead.", - QuantileNormalizer(output_pdf='rnd').fit_transform, X) - assert_raises_regex(ValueError, "'output_pdf' has to be either 'norm' or" - " 'uniform'. Got rnd instead.", - QuantileNormalizer(output_pdf='rnd').fit( + assert_raises_regex(ValueError, "'output_distribution' has to be either" + " 'norm' or 'uniform'. Got rnd instead.", + QuantileNormalizer( + output_distribution='rnd').fit_transform, X) + assert_raises_regex(ValueError, "'output_distribution' has to be either" + " 'norm' or 'uniform'. Got rnd instead.", + QuantileNormalizer(output_distribution='rnd').fit( X).inverse_transform, X) From fe009c91d59e7781d3a91909b32f4e9019415b9d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 28 Feb 2017 01:50:30 +0100 Subject: [PATCH 049/106] FIX partial fixies from comments --- sklearn/preprocessing/data.py | 49 ++++++++++++++++------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 9fb3e4bc95feb..6621438fba3ed 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -31,6 +31,7 @@ min_max_axis) from ..utils.validation import (check_is_fitted, check_random_state, FLOAT_DTYPES) +from ..utils.random import choice BOUNDS_THRESHOLD = 1e-7 @@ -1934,17 +1935,17 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): Number of quantiles to be computed. It corresponds to the number of landmarks used to discretize the cumulative density function. - subsample : int, optional (default=1e5) - Maximum number of samples used to estimate the quantiles. + output_distribution : str, optional (default='uniform') + Marginal distribution for the transformed data. The choices are + 'uniform' (default) or 'norm'. 
ignore_implicit_zeros : bool, optional (default=False) Apply only for sparse matrices. If True, the sparse entries of the matrix are discarded to compute the quantile statistics. If false, these entries are accounting for zeros. - output_distribution : str, optional (default='uniform') - Marginal distribution for the transformed data. The choices are - 'uniform' (default) or 'norm'. + subsample : int, optional (default=1e5) + Maximum number of samples used to estimate the quantiles. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -1967,13 +1968,13 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): outliers and inliers on the same scale. """ - def __init__(self, n_quantiles=1000, subsample=int(1e5), - ignore_implicit_zeros=False, output_distribution='uniform', + def __init__(self, n_quantiles=1000, output_distribution='uniform', + ignore_implicit_zeros=False, subsample=int(1e5), random_state=None): self.n_quantiles = n_quantiles - self.subsample = subsample - self.ignore_implicit_zeros = ignore_implicit_zeros self.output_distribution = output_distribution + self.ignore_implicit_zeros = ignore_implicit_zeros + self.subsample = subsample self.random_state = random_state def _build_f(self): @@ -2017,13 +2018,11 @@ def _dense_fit(self, X): # for compatibility issue with numpy<=1.8.X, references # need to be a list scaled between 0 and 100 - references = np.linspace(0, 1, self.n_quantiles, + references = np.linspace(0, 100, self.n_quantiles, endpoint=True).tolist() - # references is a list that we need to scale between - # 0 and 100. - self.quantiles_ = np.array([np.percentile( - X[subsample_idx, feature_idx], [x * 100 for x in references]) - for feature_idx in range(n_features)]).T + self.quantiles_ = np.transpose([np.percentile( + X[subsample_idx, feature_idx], references) + for feature_idx in range(n_features)]) def _sparse_fit(self, X): """Compute percentiles for sparse matrices. @@ -2036,14 +2035,12 @@ def _sparse_fit(self, X): """ rng = check_random_state(self.random_state) - n_samples, n_features = X.get_shape() + n_samples, n_features = X.shape # for compatibility issue with numpy<=1.8.X, references # need to be a list - references = np.linspace(0, 1, self.n_quantiles, + references = np.linspace(0, 100, self.n_quantiles, endpoint=True).tolist() - # references is a list that we need to scale between - # 0 and 100. self.quantiles_ = [] for feature_idx in range(n_features): column_nnz_data = X.data[X.indptr[feature_idx]: @@ -2051,16 +2048,15 @@ def _sparse_fit(self, X): if len(column_nnz_data) > self.subsample: column_subsample = (self.subsample * len(column_nnz_data) // n_samples) - # choice is not available in numpy <= 1.7 - # used permutation instead. 
- column_idx = rng.permutation(range(len(column_nnz_data))) if self.ignore_implicit_zeros: column_data = np.zeros(shape=column_subsample, dtype=X.dtype) else: column_data = np.zeros(shape=self.subsample, dtype=X.dtype) - column_data[:column_subsample] = column_nnz_data[ - column_idx[:column_subsample]] + column_data[:column_subsample] = choice(column_nnz_data, + size=column_subsample, + replace=False, + random_state=rng) else: if self.ignore_implicit_zeros: column_data = np.zeros(shape=len(column_nnz_data), @@ -2074,9 +2070,8 @@ def _sparse_fit(self, X): self.quantiles_.append([0] * len(references)) else: self.quantiles_.append( - np.percentile(column_data, - [x * 100 for x in references])) - self.quantiles_ = np.array(self.quantiles_).T + np.percentile(column_data, references)) + self.quantiles_ = np.transpose(self.quantiles_) def fit(self, X, y=None): """Compute the quantiles used for normalizing. From 6ec43a8f1eddbb1efe92b8ccc8de5df40e2e08ff Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 28 Feb 2017 11:52:12 +0100 Subject: [PATCH 050/106] COMIT change class name and code structure --- examples/preprocessing/plot_all_scaling.py | 16 +++--- sklearn/preprocessing/__init__.py | 8 +-- sklearn/preprocessing/data.py | 58 +++++++++++----------- sklearn/preprocessing/tests/test_data.py | 58 +++++++++++----------- 4 files changed, 71 insertions(+), 69 deletions(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index f1603d4706461..ef354ffeb63c4 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -16,13 +16,13 @@ This example uses different scalers and normalizers to bring the data within a smaller range. """ -from __future__ import print_function -print(__doc__) # Author: Raghav RV # Thomas Unterthiner # License: BSD 3 clause +from __future__ import print_function + from collections import OrderedDict import numpy as np @@ -37,10 +37,12 @@ from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import Normalizer -from sklearn.preprocessing.data import QuantileNormalizer +from sklearn.preprocessing.data import QuantileTransformer from sklearn.datasets import fetch_california_housing +print(__doc__) + dataset = fetch_california_housing() X_full, y_full = dataset.data, dataset.target @@ -62,11 +64,11 @@ StandardScaler().fit_transform(X)), ('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X)), - ('Data after quantile normalizing (uniform pdf)', - QuantileNormalizer(output_distribution='uniform') + ('Data after quantile transformation (uniform pdf)', + QuantileTransformer(output_distribution='uniform') .fit_transform(X)), - ('Data after quantile normalizing (gaussian pdf)', - QuantileNormalizer(output_distribution='norm') + ('Data after quantile transformation (gaussian pdf)', + QuantileTransformer(output_distribution='norm') .fit_transform(X)))) y = minmax_scale(y_full) # To make colors corresponding to the target), diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 514af29fa5f34..2b105709ffe08 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -12,7 +12,7 @@ from .data import Normalizer from .data import RobustScaler from .data import StandardScaler -from .data import QuantileNormalizer +from .data import QuantileTransformer from .data import add_dummy_feature from .data import binarize from .data import normalize @@ 
-20,7 +20,7 @@ from .data import robust_scale from .data import maxabs_scale from .data import minmax_scale -from .data import quantile_normalize +from .data import quantile_transform from .data import OneHotEncoder from .data import PolynomialFeatures @@ -43,7 +43,7 @@ 'MultiLabelBinarizer', 'MinMaxScaler', 'MaxAbsScaler', - 'QuantileNormalizer', + 'QuantileTransformer', 'Normalizer', 'OneHotEncoder', 'RobustScaler', @@ -57,5 +57,5 @@ 'maxabs_scale', 'minmax_scale', 'label_binarize', - 'quantile_normalize', + 'quantile_transform', ] diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 6621438fba3ed..4829782f5437d 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -49,7 +49,7 @@ 'OneHotEncoder', 'RobustScaler', 'StandardScaler', - 'QuantileNormalizer', + 'QuantileTransformer', 'add_dummy_feature', 'binarize', 'normalize', @@ -57,6 +57,7 @@ 'robust_scale', 'maxabs_scale', 'minmax_scale', + 'quantile_transform', ] @@ -1912,16 +1913,15 @@ def transform(self, X): self.categorical_features, copy=True) -class QuantileNormalizer(BaseEstimator, TransformerMixin): - """Normalize features using quantiles information. +class QuantileTransformer(BaseEstimator, TransformerMixin): + """Transform features using quantiles information. - This Normalizer scales the features between 0 and 1, equalizing the - distribution of each feature to a uniform distribution. Therefore, - for a given feature, this normalization tends to spread out the most - frequent values. It also reduces the impact of (marginal) outliers: - this is therefore a robust preprocessing scheme. + This method scales the features to follow a uniform or a normal + distribution. Therefore, for a given feature, this transformation tends + to spread out the most frequent values. It also reduces the impact of + (marginal) outliers: this is therefore a robust preprocessing scheme. - The normalization is applied on each feature independently. + The transformation is applied on each feature independently. The cumulative density function of a feature is used to project the original values. Features values of new/unseen data that fall below or above the fitted range will be mapped to 0 and 1, respectively. @@ -1960,12 +1960,12 @@ class QuantileNormalizer(BaseEstimator, TransformerMixin): See also -------- - :class:`sklearn.preprocessing.StandardScaler` to perform standardization - that is faster, but less robust to outliers. - :class:`sklearn.preprocessing.RobustScaler` to perform robust - standardization that removes the influence of outliers but does not put - outliers and inliers on the same scale. + StandardScaler : perform standardization that is faster, but less robust + to outliers. + + RobustScaler : perform robust standardization that removes the influence + of outliers but does not put outliers and inliers on the same scale. 
""" def __init__(self, n_quantiles=1000, output_distribution='uniform', @@ -2014,15 +2014,15 @@ def _dense_fit(self, X): if self.subsample < n_samples: subsample_idx = rng.permutation(range(n_samples))[:self.subsample] else: - subsample_idx = range(n_samples) + subsample_idx = slice(None) # for compatibility issue with numpy<=1.8.X, references # need to be a list scaled between 0 and 100 references = np.linspace(0, 100, self.n_quantiles, endpoint=True).tolist() - self.quantiles_ = np.transpose([np.percentile( - X[subsample_idx, feature_idx], references) - for feature_idx in range(n_features)]) + self.quantiles_ = np.transpose( + [np.percentile(X[subsample_idx, feature_idx], references) + for feature_idx in range(n_features)]) def _sparse_fit(self, X): """Compute percentiles for sparse matrices. @@ -2074,7 +2074,7 @@ def _sparse_fit(self, X): self.quantiles_ = np.transpose(self.quantiles_) def fit(self, X, y=None): - """Compute the quantiles used for normalizing. + """Compute the quantiles used for transforming. Parameters ---------- @@ -2103,7 +2103,7 @@ def fit(self, X, y=None): # we only accept positive sparse matrix if sparse.issparse(X) and np.any(X.data < 0): - raise ValueError('QuantileNormalizer only accepts non-negative' + raise ValueError('QuantileTransformer only accepts non-negative' ' sparse matrices') if sparse.issparse(X): @@ -2257,7 +2257,7 @@ def _sparse_transform(self, X, direction=True): return X def transform(self, X): - """Feature-wise normalization of the data. + """Feature-wise transformation of the data. Parameters ---------- @@ -2276,7 +2276,7 @@ def transform(self, X): dtype=[np.float64, np.float32]) # we only accept positive sparse matrix if sparse.issparse(X) and np.any(X.data < 0): - raise ValueError('QuantileNormalizer only accepts non-negative' + raise ValueError('QuantileTransformer only accepts non-negative' ' sparse matrices') check_is_fitted(self, '_f_transform') # check that the dimension of X are adequate with the fitted data @@ -2312,7 +2312,7 @@ def inverse_transform(self, X): X = check_array(X, accept_sparse='csc') # we only accept positive sparse matrix if sparse.issparse(X) and np.any(X.data < 0): - raise ValueError('QuantileNormalizer only accepts non-negative' + raise ValueError('QuantileTransformer only accepts non-negative' ' sparse matrices') check_is_fitted(self, '_f_inverse_transform') # check that the dimension of X are adequate with the fitted data @@ -2333,7 +2333,7 @@ def inverse_transform(self, X): def __getstate__(self): """Pickle-protocol - return state of the estimator. """ - state = super(QuantileNormalizer, self).__getstate__() + state = super(QuantileTransformer, self).__getstate__() # remove interpolation method state.pop('_f_transform', None) state.pop('_f_inverse_transform', None) @@ -2343,16 +2343,16 @@ def __setstate__(self, state): """Pickle-protocol - set state of the estimator. We need to rebuild the interpolation function. 
""" - super(QuantileNormalizer, self).__setstate__(state) + super(QuantileTransformer, self).__setstate__(state) if hasattr(self, 'quantiles_'): self._build_f() -def quantile_normalize(X, axis=0, n_quantiles=1000, subsample=int(1e5), +def quantile_transform(X, axis=0, n_quantiles=1000, subsample=int(1e5), ignore_implicit_zeros=False, random_state=None): - n = QuantileNormalizer(n_quantiles=n_quantiles, subsample=subsample, - ignore_implicit_zeros=ignore_implicit_zeros, - random_state=random_state) + n = QuantileTransformer(n_quantiles=n_quantiles, subsample=subsample, + ignore_implicit_zeros=ignore_implicit_zeros, + random_state=random_state) if axis == 0: return n.fit_transform(X) elif axis == 1: diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 0d47657773892..ecee0bc4c7354 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -42,8 +42,8 @@ from sklearn.preprocessing.data import scale from sklearn.preprocessing.data import MinMaxScaler from sklearn.preprocessing.data import minmax_scale -from sklearn.preprocessing.data import QuantileNormalizer -from sklearn.preprocessing.data import quantile_normalize +from sklearn.preprocessing.data import QuantileTransformer +from sklearn.preprocessing.data import quantile_transform from sklearn.preprocessing.data import MaxAbsScaler from sklearn.preprocessing.data import maxabs_scale from sklearn.preprocessing.data import RobustScaler @@ -853,9 +853,9 @@ def test_robust_scaler_iris_quantiles(): assert_array_almost_equal(q_range, 1) -def test_quantile_normalizer_iris(): +def test_quantile_transformr_iris(): X = iris.data - normalizer = QuantileNormalizer() + normalizer = QuantileTransformer() X_trans = normalizer.fit_transform(X) assert_array_almost_equal(np.min(X_trans, axis=0), 0.) assert_array_almost_equal(np.max(X_trans, axis=0), 1.) @@ -863,7 +863,7 @@ def test_quantile_normalizer_iris(): assert_array_almost_equal(X, X_trans_inv) -def test_quantile_normalizer_check_error(): +def test_quantile_transformr_check_error(): X = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T @@ -874,18 +874,18 @@ def test_quantile_normalizer_check_error(): X_neg = sparse.csc_matrix(X_neg) assert_raises_regex(ValueError, "Invalid value for 'n_quantiles'", - QuantileNormalizer(n_quantiles=0).fit, X_neg) + QuantileTransformer(n_quantiles=0).fit, X_neg) assert_raises_regex(ValueError, "Invalid value for 'subsample'", - QuantileNormalizer(subsample=0).fit, X_neg) + QuantileTransformer(subsample=0).fit, X_neg) - normalizer = QuantileNormalizer() - assert_raises_regex(ValueError, "QuantileNormalizer only accepts " + normalizer = QuantileTransformer() + assert_raises_regex(ValueError, "QuantileTransformer only accepts " "non-negative sparse matrices", normalizer.fit, X_neg) normalizer.fit(X) - assert_raises_regex(ValueError, "QuantileNormalizer only accepts " + assert_raises_regex(ValueError, "QuantileTransformer only accepts " "non-negative sparse matrices", normalizer.transform, X_neg) - assert_raises_regex(ValueError, "QuantileNormalizer only accepts " + assert_raises_regex(ValueError, "QuantileTransformer only accepts " "non-negative sparse matrices", normalizer.inverse_transform, X_neg) @@ -900,19 +900,19 @@ def test_quantile_normalizer_check_error(): assert_raises_regex(ValueError, "'output_distribution' has to be either" " 'norm' or 'uniform'. 
Got rnd instead.", - QuantileNormalizer( + QuantileTransformer( output_distribution='rnd').fit_transform, X) assert_raises_regex(ValueError, "'output_distribution' has to be either" " 'norm' or 'uniform'. Got rnd instead.", - QuantileNormalizer(output_distribution='rnd').fit( + QuantileTransformer(output_distribution='rnd').fit( X).inverse_transform, X) -def test_quantile_normalizer_ignore_zeros(): +def test_quantile_transformr_ignore_zeros(): X = np.array([[0, 0, 0, 0, 0], [1, 0, 2, 2, 1]]).T X_sparse = sparse.csc_matrix(X) - nq = QuantileNormalizer(ignore_implicit_zeros=True, n_quantiles=5) + nq = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) # dense case -> warning raise assert_warns_message(UserWarning, "'ignore_implicit_zeros' takes effect" @@ -943,12 +943,12 @@ def test_quantile_normalizer_ignore_zeros(): assert_almost_equal(X_gt, X_trans.A) -def test_quantile_normalizer_dense_toy(): +def test_quantile_transformr_dense_toy(): X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]).T - normalizer = QuantileNormalizer() + normalizer = QuantileTransformer() normalizer.fit(X) X_trans = normalizer.fit_transform(X) @@ -977,14 +977,14 @@ def test_quantile_normalizer_dense_toy(): X_trans = normalizer.fit_transform(X) -def test_quantile_normalizer_sparse_toy(): +def test_quantile_transformr_sparse_toy(): X = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T X = sparse.csc_matrix(X) - normalizer = QuantileNormalizer() + normalizer = QuantileTransformer() normalizer.fit(X) X_trans = normalizer.fit_transform(X) @@ -994,7 +994,7 @@ def test_quantile_normalizer_sparse_toy(): X_trans_inv = normalizer.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) - normalizer_dense = QuantileNormalizer().fit(X.toarray()) + normalizer_dense = QuantileTransformer().fit(X.toarray()) X_trans = normalizer_dense.transform(X) assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.) 
@@ -1012,13 +1012,13 @@ def test_quantile_normalizer_sparse_toy(): X_trans = normalizer.fit_transform(X) -def test_quantile_normalize_axis1(): +def test_quantile_transform_axis1(): X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]) - X_trans_a0 = quantile_normalize(X.T, axis=0) - X_trans_a1 = quantile_normalize(X, axis=1) + X_trans_a0 = quantile_transform(X.T, axis=0) + X_trans_a1 = quantile_transform(X, axis=1) assert_array_almost_equal(X_trans_a0, X_trans_a1.T) @@ -1029,9 +1029,9 @@ def test_qunatile_normalzer_bounds(): X_sparse = sparse.csc_matrix(X_dense) # check sparse and dense are consistent - X_trans = QuantileNormalizer().fit_transform(X_dense) + X_trans = QuantileTransformer().fit_transform(X_dense) assert_array_almost_equal(X_trans, X_dense) - X_trans_sp = QuantileNormalizer().fit_transform(X_sparse) + X_trans_sp = QuantileTransformer().fit_transform(X_sparse) assert_array_almost_equal(X_trans_sp.A, X_dense) assert_array_almost_equal(X_trans, X_trans_sp.A) @@ -1041,13 +1041,13 @@ def test_qunatile_normalzer_bounds(): [1, 0.5, 0]]).T X1 = np.array([[0, 0, 1], [0.1, 0.5, 0.1]]).T - qn = QuantileNormalizer().fit(X) + qn = QuantileTransformer().fit(X) X_trans = qn.transform(X1) assert_array_almost_equal(X_trans, X1) -def test_quantile_normalizer_pickling(): - qn = QuantileNormalizer(n_quantiles=100) +def test_quantile_transformr_pickling(): + qn = QuantileTransformer(n_quantiles=100) qn_ser = pickle.dumps(qn, pickle.HIGHEST_PROTOCOL) qn2 = pickle.loads(qn_ser) @@ -1858,7 +1858,7 @@ def test_function_valid_axis(): [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]) - func_list = [quantile_normalize] + func_list = [quantile_transform] for func in func_list: assert_raises_regex(ValueError, "axis should be either equal to 0 or 1" From e94cd485df595000ec894ea03acb70c7cedc08fc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 28 Feb 2017 13:14:30 +0100 Subject: [PATCH 051/106] COSMIT change direction to inverse --- sklearn/preprocessing/data.py | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 4829782f5437d..eadea760e9b35 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2115,7 +2115,7 @@ def fit(self, X, y=None): return self - def _dense_transform(self, X, direction=True): + def _dense_transform(self, X, inverse=False): """Forward and inverse transform for dense matrices. Parameters @@ -2123,8 +2123,8 @@ def _dense_transform(self, X, direction=True): X : ndarray, shape (n_samples, n_features) The data used to scale along the features axis. - direction : bool, optional (default=True) - If True, apply forward transform. If False, apply + inverse : bool, optional (default=False) + If False, apply forward transform. If True, apply inverse transform. Returns @@ -2132,7 +2132,7 @@ def _dense_transform(self, X, direction=True): X : ndarray, shape (n_samples, n_features) Projected data. 
""" - if direction: + if not inverse: func_transform = self._f_transform else: func_transform = self._f_inverse_transform @@ -2143,7 +2143,7 @@ def _dense_transform(self, X, direction=True): for feature_idx, f in enumerate(func_transform): # older version of scipy do not handle tuple as fill_value # clipping the value before transform solve the issue - if direction: + if not inverse: lower_bound_x = self.quantiles_[0, feature_idx] upper_bound_x = self.quantiles_[-1, feature_idx] lower_bound_y = references[0] @@ -2153,7 +2153,7 @@ def _dense_transform(self, X, direction=True): upper_bound_x = references[-1] lower_bound_y = self.quantiles_[0, feature_idx] upper_bound_y = self.quantiles_[-1, feature_idx] - if not direction: + if inverse: # for inverse transform, match a uniform PDF for i in range(X.shape[0]): X[i, feature_idx] = output_distribution.cdf( @@ -2168,7 +2168,7 @@ def _dense_transform(self, X, direction=True): X[upper_bounds_idx, feature_idx] = upper_bound_y X[lower_bounds_idx, feature_idx] = lower_bound_y # for forward transform, match the output PDF - if direction: + if not inverse: for i in range(X.shape[0]): X[i, feature_idx] = output_distribution.ppf( X[i, feature_idx]) @@ -2181,7 +2181,7 @@ def _dense_transform(self, X, direction=True): clip_max) return X - def _sparse_transform(self, X, direction=True): + def _sparse_transform(self, X, inverse=False): """Forward and inverse transform for sparse matrices. Parameters @@ -2190,8 +2190,8 @@ def _sparse_transform(self, X, direction=True): The data used to scale along the features axis. The sparse matrix needs to be semi-positive. - direction : bool, optional (default=True) - If True, apply forward transform. If False, apply + inverse : bool, optional (default=False) + If False, apply forward transform. If True, apply inverse transform. Returns @@ -2199,7 +2199,7 @@ def _sparse_transform(self, X, direction=True): X : sparse matrix CSC, shape (n_samples, n_features) Projected data. """ - if direction: + if not inverse: func_transform = self._f_transform else: func_transform = self._f_inverse_transform @@ -2212,7 +2212,7 @@ def _sparse_transform(self, X, direction=True): X.indptr[feature_idx + 1]) # older version of scipy do not handle tuple as fill_value # clipping the value before transform solve the issue - if direction: + if not inverse: lower_bound_x = self.quantiles_[0, feature_idx] upper_bound_x = self.quantiles_[-1, feature_idx] lower_bound_y = references[0] @@ -2223,7 +2223,7 @@ def _sparse_transform(self, X, direction=True): lower_bound_y = self.quantiles_[0, feature_idx] upper_bound_y = self.quantiles_[-1, feature_idx] # for inverse transform, match a uniform PDF - if not direction: + if inverse: for i in range(X.data[column_slice].size): X.data[column_slice][i] = output_distribution.cdf( X.data[column_slice][i]) @@ -2240,7 +2240,7 @@ def _sparse_transform(self, X, direction=True): X.data[column_slice][upper_bounds_idx] = upper_bound_y X.data[column_slice][lower_bounds_idx] = lower_bound_y # for forward transform, match the output PDF - if direction: + if not inverse: # find the value to clip the data to avoid mapping to # infinity. Clip such that the inverse transform will be # consistent. 
@@ -2291,9 +2291,9 @@ def transform(self, X): self.output_distribution)) if sparse.issparse(X): - return self._sparse_transform(X, True) + return self._sparse_transform(X, inverse=False) else: - return self._dense_transform(X, True) + return self._dense_transform(X, inverse=False) def inverse_transform(self, X): """Back-projection to the original space. @@ -2327,9 +2327,9 @@ def inverse_transform(self, X): " 'uniform'. Got {} instead.".format( self.output_distribution)) if sparse.issparse(X): - return self._sparse_transform(X, False) + return self._sparse_transform(X, inverse=True) else: - return self._dense_transform(X, False) + return self._dense_transform(X, inverse=True) def __getstate__(self): """Pickle-protocol - return state of the estimator. """ From 9c13d2a88124610f29ad49bc966b767652fdc86e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 28 Feb 2017 16:04:49 +0100 Subject: [PATCH 052/106] FIX factorize transform in _transform_col --- sklearn/preprocessing/data.py | 147 ++++++++++++---------------------- 1 file changed, 50 insertions(+), 97 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index eadea760e9b35..3f31b10ed993c 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2115,6 +2115,49 @@ def fit(self, X, y=None): return self + def _transform_col(self, X_col, feature_idx, inverse): + """Private function to transform a single feature""" + + if not inverse: + func_transform = self._f_transform[feature_idx] + else: + func_transform = self._f_inverse_transform[feature_idx] + output_distribution = getattr(stats, self.output_distribution) + # older version of scipy do not handle tuple as fill_value + # clipping the value before transform solve the issue + if not inverse: + lower_bound_x = self.quantiles_[0, feature_idx] + upper_bound_x = self.quantiles_[-1, feature_idx] + lower_bound_y = 0 + upper_bound_y = 1 + else: + lower_bound_x = 0 + upper_bound_x = 1 + lower_bound_y = self.quantiles_[0, feature_idx] + upper_bound_y = self.quantiles_[-1, feature_idx] + # for inverse transform, match a uniform PDF + X_col = output_distribution.cdf(X_col) + # find index for lower and higher bounds + lower_bounds_idx = (X_col - BOUNDS_THRESHOLD < + lower_bound_x) + upper_bounds_idx = (X_col + BOUNDS_THRESHOLD > + upper_bound_x) + X_col = func_transform(X_col) + X_col[upper_bounds_idx] = upper_bound_y + X_col[lower_bounds_idx] = lower_bound_y + # for forward transform, match the output PDF + if not inverse: + X_col = output_distribution.ppf(X_col) + # find the value to clip the data to avoid mapping to + # infinity. Clip such that the inverse transform will be + # consistent + clip_min = output_distribution.ppf(BOUNDS_THRESHOLD / 10) + clip_max = output_distribution.ppf(1 - (BOUNDS_THRESHOLD / 10)) + X_col = np.clip(X_col, clip_min, clip_max) + + return X_col + + def _dense_transform(self, X, inverse=False): """Forward and inverse transform for dense matrices. @@ -2132,53 +2175,11 @@ def _dense_transform(self, X, inverse=False): X : ndarray, shape (n_samples, n_features) Projected data. 
""" - if not inverse: - func_transform = self._f_transform - else: - func_transform = self._f_inverse_transform - output_distribution = getattr(stats, self.output_distribution) - references = np.linspace(0, 1, self.n_quantiles, endpoint=True) + for feature_idx in range(X.shape[1]): + X[:, feature_idx] = self._transform_col(X[:, feature_idx], + feature_idx, inverse) - for feature_idx, f in enumerate(func_transform): - # older version of scipy do not handle tuple as fill_value - # clipping the value before transform solve the issue - if not inverse: - lower_bound_x = self.quantiles_[0, feature_idx] - upper_bound_x = self.quantiles_[-1, feature_idx] - lower_bound_y = references[0] - upper_bound_y = references[-1] - else: - lower_bound_x = references[0] - upper_bound_x = references[-1] - lower_bound_y = self.quantiles_[0, feature_idx] - upper_bound_y = self.quantiles_[-1, feature_idx] - if inverse: - # for inverse transform, match a uniform PDF - for i in range(X.shape[0]): - X[i, feature_idx] = output_distribution.cdf( - X[i, feature_idx]) - # Avoid computing for bounds due to numerical error of interp1d - lower_bounds_idx = (X[:, feature_idx] - BOUNDS_THRESHOLD < - lower_bound_x) - upper_bounds_idx = (X[:, feature_idx] + BOUNDS_THRESHOLD > - upper_bound_x) - bounds_idx = np.bitwise_or(lower_bounds_idx, upper_bounds_idx) - X[~bounds_idx, feature_idx] = f(X[~bounds_idx, feature_idx]) - X[upper_bounds_idx, feature_idx] = upper_bound_y - X[lower_bounds_idx, feature_idx] = lower_bound_y - # for forward transform, match the output PDF - if not inverse: - for i in range(X.shape[0]): - X[i, feature_idx] = output_distribution.ppf( - X[i, feature_idx]) - # find the value to clip the data to avoid mapping to - # infinity. Clip such that the inverse transform will be - # consistent - clip_min = output_distribution.ppf(BOUNDS_THRESHOLD / 10) - clip_max = output_distribution.ppf(1 - (BOUNDS_THRESHOLD / 10)) - X[:, feature_idx] = np.clip(X[:, feature_idx], clip_min, - clip_max) return X def _sparse_transform(self, X, inverse=False): @@ -2199,61 +2200,13 @@ def _sparse_transform(self, X, inverse=False): X : sparse matrix CSC, shape (n_samples, n_features) Projected data. 
""" - if not inverse: - func_transform = self._f_transform - else: - func_transform = self._f_inverse_transform - output_distribution = getattr(stats, self.output_distribution) - - references = np.linspace(0, 1, self.n_quantiles, endpoint=True) - for feature_idx, f in enumerate(func_transform): + for feature_idx in range(X.shape[1]): column_slice = slice(X.indptr[feature_idx], X.indptr[feature_idx + 1]) - # older version of scipy do not handle tuple as fill_value - # clipping the value before transform solve the issue - if not inverse: - lower_bound_x = self.quantiles_[0, feature_idx] - upper_bound_x = self.quantiles_[-1, feature_idx] - lower_bound_y = references[0] - upper_bound_y = references[-1] - else: - lower_bound_x = references[0] - upper_bound_x = references[-1] - lower_bound_y = self.quantiles_[0, feature_idx] - upper_bound_y = self.quantiles_[-1, feature_idx] - # for inverse transform, match a uniform PDF - if inverse: - for i in range(X.data[column_slice].size): - X.data[column_slice][i] = output_distribution.cdf( - X.data[column_slice][i]) - - # Avoid computing for bounds due to numerical error of interp1d - # Check that there is value - if X.data[column_slice].size: - lower_bounds_idx = (X.data[column_slice] - BOUNDS_THRESHOLD < - lower_bound_x) - upper_bounds_idx = (X.data[column_slice] + BOUNDS_THRESHOLD > - upper_bound_x) - X.data[column_slice][~upper_bounds_idx] = f( - X.data[column_slice][~upper_bounds_idx]) - X.data[column_slice][upper_bounds_idx] = upper_bound_y - X.data[column_slice][lower_bounds_idx] = lower_bound_y - # for forward transform, match the output PDF - if not inverse: - # find the value to clip the data to avoid mapping to - # infinity. Clip such that the inverse transform will be - # consistent. - clip_min = output_distribution.ppf(BOUNDS_THRESHOLD / 10) - clip_max = output_distribution.ppf(1 - ( - BOUNDS_THRESHOLD / 10)) - for i in range(X.data[column_slice].size): - X.data[column_slice][i] = output_distribution.ppf( - X.data[column_slice][i]) - if X.data[column_slice][i] > clip_max: - X.data[column_slice][i] = clip_max - elif X.data[column_slice][i] < clip_min: - X.data[column_slice][i] = clip_min + X.data[column_slice] = self._transform_col(X.data[column_slice], + feature_idx, inverse) + return X def transform(self, X): From 5d544efb7005c133e5eda2c20e3b23f32bc1f5e0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 28 Feb 2017 16:05:38 +0100 Subject: [PATCH 053/106] PEP8 --- sklearn/preprocessing/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 3f31b10ed993c..89afe8f2ad6a7 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2157,7 +2157,6 @@ def _transform_col(self, X_col, feature_idx, inverse): return X_col - def _dense_transform(self, X, inverse=False): """Forward and inverse transform for dense matrices. From d9b3e7a5045132207791e122f9d7f961bcb7268f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 28 Feb 2017 16:13:52 +0100 Subject: [PATCH 054/106] FIX change the magic 10 --- sklearn/preprocessing/data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 89afe8f2ad6a7..35f4156919b51 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2151,8 +2151,10 @@ def _transform_col(self, X_col, feature_idx, inverse): # find the value to clip the data to avoid mapping to # infinity. 
Clip such that the inverse transform will be # consistent - clip_min = output_distribution.ppf(BOUNDS_THRESHOLD / 10) - clip_max = output_distribution.ppf(1 - (BOUNDS_THRESHOLD / 10)) + clip_min = output_distribution.ppf(BOUNDS_THRESHOLD - + np.spacing(1)) + clip_max = output_distribution.ppf(1 - (BOUNDS_THRESHOLD - + np.spacing(1))) X_col = np.clip(X_col, clip_min, clip_max) return X_col From 23b3a913cec5697de569670a6b6f27550f2e3c80 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 28 Feb 2017 16:44:22 +0100 Subject: [PATCH 055/106] FIX add interp1d to fixes --- sklearn/preprocessing/data.py | 12 ++++++++---- sklearn/utils/fixes.py | 11 +++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 35f4156919b51..3425278148fda 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -15,7 +15,7 @@ import numpy as np from scipy import sparse -from scipy.interpolate import interp1d +# from scipy.interpolate import interp1d from scipy import stats from ..base import BaseEstimator, TransformerMixin @@ -23,7 +23,7 @@ from ..utils import check_array from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var -from ..utils.fixes import bincount +from ..utils.fixes import bincount, interp1d from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2) from ..utils.sparsefuncs import (inplace_column_scale, @@ -1989,14 +1989,18 @@ def _build_f(self): self._f_transform = tuple([ interp1d(quantiles_feature, references, + copy=False, bounds_error=False, - fill_value=0.) + fill_value=0., + assume_sorted=True) for quantiles_feature in self.quantiles_.T]) self._f_inverse_transform = tuple([ interp1d(references, quantiles_feature, + copy=False, bounds_error=False, - fill_value=0.) + fill_value=0., + assume_sorted=True) for quantiles_feature in self.quantiles_.T]) def _dense_fit(self, X): diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index d789d5f525cd4..378b540bfdeab 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -362,6 +362,17 @@ def rankdata(a, method='average'): from scipy.stats import rankdata +if sp_version < (0, 14, 0): + def interp1d(x, y, kind='linear', axis=-1, + copy=True, bounds_error=True, + fill_value=np.nan, assume_sorted=False): + + return scipy.interpolate.interp1d(x, y, kind, axis, copy, + bounds_error, fill_value) +else: + from scipy.interpolate import interp1d + + if np_version < (1, 12): class MaskedArray(np.ma.MaskedArray): # Before numpy 1.12, np.ma.MaskedArray object is not picklable From 04dc89a58a0704f9257e7726e3fbad314087dee6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 28 Feb 2017 17:54:09 +0100 Subject: [PATCH 056/106] FIX/TST allow negative entries when ignore_implicit_zeros is True --- sklearn/preprocessing/data.py | 23 ++++++++++++++--------- sklearn/preprocessing/tests/test_data.py | 16 ++++++++++++++++ sklearn/utils/fixes.py | 2 ++ 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 3425278148fda..dffa93f7bedd9 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2105,8 +2105,10 @@ def fit(self, X, y=None): "The number of quantiles must be at least one." 
% self.subsample) - # we only accept positive sparse matrix - if sparse.issparse(X) and np.any(X.data < 0): + # we only accept positive sparse matrix when ignore_implicit_zeros is + # false + if (not self.ignore_implicit_zeros and + (sparse.issparse(X) and np.any(X.data < 0))): raise ValueError('QuantileTransformer only accepts non-negative' ' sparse matrices') @@ -2177,9 +2179,8 @@ def _dense_transform(self, X, inverse=False): Returns ------- - X : ndarray, shape (n_samples, n_features) - Projected data. - """ + X : ndarray, shape (n_samples, n_features) + Projected data """ for feature_idx in range(X.shape[1]): X[:, feature_idx] = self._transform_col(X[:, feature_idx], @@ -2232,8 +2233,10 @@ def transform(self, X): """ X = check_array(X, accept_sparse='csc', copy=True, dtype=[np.float64, np.float32]) - # we only accept positive sparse matrix - if sparse.issparse(X) and np.any(X.data < 0): + # we only accept positive sparse matrix when ignore_implicit_zeros is + # false + if (not self.ignore_implicit_zeros and + (sparse.issparse(X) and np.any(X.data < 0))): raise ValueError('QuantileTransformer only accepts non-negative' ' sparse matrices') check_is_fitted(self, '_f_transform') @@ -2268,8 +2271,10 @@ def inverse_transform(self, X): The projected data. """ X = check_array(X, accept_sparse='csc') - # we only accept positive sparse matrix - if sparse.issparse(X) and np.any(X.data < 0): + # we only accept positive sparse matrix when ignore_implicit_zeros is + # false + if (not self.ignore_implicit_zeros and + (sparse.issparse(X) and np.any(X.data < 0))): raise ValueError('QuantileTransformer only accepts non-negative' ' sparse matrices') check_is_fitted(self, '_f_inverse_transform') diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index ecee0bc4c7354..493a4e77256be 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -942,6 +942,22 @@ def test_quantile_transformr_ignore_zeros(): [0., 0.]]) assert_almost_equal(X_gt, X_trans.A) + nq = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) + X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1]) + X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1]) + X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6]) + X_sparse = sparse.csc_matrix((X_data, (X_row, X_col))) + X_trans = nq.fit_transform(X_sparse) + X_gt = np.array([[0, 1], + [0, 0.5], + [0, 0.5], + [0, 0.5], + [0, 1], + [0, 0], + [0, 1]]) + assert_almost_equal(X_gt, X_trans.A) + assert_almost_equal(X_sparse.A, nq.inverse_transform(X_trans).A) + def test_quantile_transformr_dense_toy(): X = np.array([[0, 25, 50, 75, 100], diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 378b540bfdeab..ceb43fee2a8a0 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -363,6 +363,8 @@ def rankdata(a, method='average'): if sp_version < (0, 14, 0): + # Before scipy 0.14.0, interp1d does not accept any assume_sorted + # argument. 
def interp1d(x, y, kind='linear', axis=-1, copy=True, bounds_error=True, fill_value=np.nan, assume_sorted=False): From 9377cc229630d111d9c08fb74873414e6ab43fa5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 1 Mar 2017 15:38:52 +0100 Subject: [PATCH 057/106] FIX use np.interp instead of sp.interpolate.interp1d --- sklearn/preprocessing/data.py | 131 ++++++++--------------- sklearn/preprocessing/tests/test_data.py | 12 +-- sklearn/utils/fixes.py | 13 --- 3 files changed, 48 insertions(+), 108 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index dffa93f7bedd9..f022b79b4cb79 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -15,7 +15,6 @@ import numpy as np from scipy import sparse -# from scipy.interpolate import interp1d from scipy import stats from ..base import BaseEstimator, TransformerMixin @@ -23,7 +22,7 @@ from ..utils import check_array from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var -from ..utils.fixes import bincount, interp1d +from ..utils.fixes import bincount from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2) from ..utils.sparsefuncs import (inplace_column_scale, @@ -1977,32 +1976,6 @@ def __init__(self, n_quantiles=1000, output_distribution='uniform', self.subsample = subsample self.random_state = random_state - def _build_f(self): - """Build the transform functions.""" - check_is_fitted(self, 'quantiles_') - - if self.ignore_implicit_zeros: - warnings.warn("'ignore_implicit_zeros' takes effect only with" - " sparse matrix. This parameter has no effect.") - - references = np.linspace(0, 1, self.n_quantiles, endpoint=True) - - self._f_transform = tuple([ - interp1d(quantiles_feature, references, - copy=False, - bounds_error=False, - fill_value=0., - assume_sorted=True) - for quantiles_feature in self.quantiles_.T]) - - self._f_inverse_transform = tuple([ - interp1d(references, quantiles_feature, - copy=False, - bounds_error=False, - fill_value=0., - assume_sorted=True) - for quantiles_feature in self.quantiles_.T]) - def _dense_fit(self, X): """Compute percentiles for dense matrices. @@ -2011,6 +1984,10 @@ def _dense_fit(self, X): X : ndarray, shape (n_samples, n_features) The data used to scale along the features axis. """ + if self.ignore_implicit_zeros: + warnings.warn("'ignore_implicit_zeros' takes effect only with" + " sparse matrix. 
This parameter has no effect.") + rng = check_random_state(self.random_state) # subsample the matrix X if necessary @@ -2117,17 +2094,11 @@ def fit(self, X, y=None): else: self._dense_fit(X) - self._build_f() - return self def _transform_col(self, X_col, feature_idx, inverse): """Private function to transform a single feature""" - if not inverse: - func_transform = self._f_transform[feature_idx] - else: - func_transform = self._f_inverse_transform[feature_idx] output_distribution = getattr(stats, self.output_distribution) # older version of scipy do not handle tuple as fill_value # clipping the value before transform solve the issue @@ -2148,7 +2119,15 @@ def _transform_col(self, X_col, feature_idx, inverse): lower_bound_x) upper_bounds_idx = (X_col + BOUNDS_THRESHOLD > upper_bound_x) - X_col = func_transform(X_col) + + references = np.linspace(0, 1, self.n_quantiles, endpoint=True) + if not inverse: + X_col = np.interp(X_col, self.quantiles_[:, feature_idx], + references) + else: + X_col = np.interp(X_col, references, + self.quantiles_[:, feature_idx]) + X_col[upper_bounds_idx] = upper_bound_y X_col[lower_bounds_idx] = lower_bound_y # for forward transform, match the output PDF @@ -2215,22 +2194,8 @@ def _sparse_transform(self, X, inverse=False): return X - def transform(self, X): - """Feature-wise transformation of the data. - - Parameters - ---------- - X : ndarray or sparse matrix, shape (n_samples, n_features) - The data to be normalized along the features axis. If a sparse - matrix is provided, it will be converted into a sparse - ``csc_matrix``. Additionally, the sparse matrix needs to be - semi-positive. - - Returns - ------- - Xt : ndarray or sparse matrix, shape (n_samples, n_features) - The projected data. - """ + def _check_inputs_transform(self, X): + """Private function to check the inputs before transforming""" X = check_array(X, accept_sparse='csc', copy=True, dtype=[np.float64, np.float32]) # we only accept positive sparse matrix when ignore_implicit_zeros is @@ -2239,18 +2204,39 @@ def transform(self, X): (sparse.issparse(X) and np.any(X.data < 0))): raise ValueError('QuantileTransformer only accepts non-negative' ' sparse matrices') - check_is_fitted(self, '_f_transform') + check_is_fitted(self, 'quantiles_') # check that the dimension of X are adequate with the fitted data - if X.shape[1] != len(self._f_transform): + if X.shape[1] != self.quantiles_.shape[1]: raise ValueError('X does not have the same number of feature than' ' the previously fitted data. Got {} instead of' - ' {}'.format(X.shape[1], len(self._f_transform))) + ' {}'.format(X.shape[1], + self.quantiles_.shape[1])) # check the output PDF if self.output_distribution not in ('norm', 'uniform'): raise ValueError("'output_distribution' has to be either 'norm' or" " 'uniform'. Got {} instead.".format( self.output_distribution)) + return X + + def transform(self, X): + """Feature-wise transformation of the data. + + Parameters + ---------- + X : ndarray or sparse matrix, shape (n_samples, n_features) + The data to be normalized along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + semi-positive. + + Returns + ------- + Xt : ndarray or sparse matrix, shape (n_samples, n_features) + The projected data. 
+ """ + X = self._check_inputs_transform(X) + if sparse.issparse(X): return self._sparse_transform(X, inverse=False) else: @@ -2270,46 +2256,13 @@ def inverse_transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. """ - X = check_array(X, accept_sparse='csc') - # we only accept positive sparse matrix when ignore_implicit_zeros is - # false - if (not self.ignore_implicit_zeros and - (sparse.issparse(X) and np.any(X.data < 0))): - raise ValueError('QuantileTransformer only accepts non-negative' - ' sparse matrices') - check_is_fitted(self, '_f_inverse_transform') - # check that the dimension of X are adequate with the fitted data - if X.shape[1] != len(self._f_inverse_transform): - raise ValueError('X does not have the same number of feature than' - ' the previously fitted data. Got {} instead of' - ' {}'.format(X.shape[1], - len(self._f_inverse_transform))) - # check the output PDF - if self.output_distribution not in ('norm', 'uniform'): - raise ValueError("'output_distribution' has to be either 'norm' or" - " 'uniform'. Got {} instead.".format( - self.output_distribution)) + X = self._check_inputs_transform(X) + if sparse.issparse(X): return self._sparse_transform(X, inverse=True) else: return self._dense_transform(X, inverse=True) - def __getstate__(self): - """Pickle-protocol - return state of the estimator. """ - state = super(QuantileTransformer, self).__getstate__() - # remove interpolation method - state.pop('_f_transform', None) - state.pop('_f_inverse_transform', None) - return state - - def __setstate__(self, state): - """Pickle-protocol - set state of the estimator. - We need to rebuild the interpolation function. - """ - super(QuantileTransformer, self).__setstate__(state) - if hasattr(self, 'quantiles_'): - self._build_f() - def quantile_transform(X, axis=0, n_quantiles=1000, subsample=int(1e5), ignore_implicit_zeros=False, random_state=None): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 493a4e77256be..f49a2330b1728 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -853,7 +853,7 @@ def test_robust_scaler_iris_quantiles(): assert_array_almost_equal(q_range, 1) -def test_quantile_transformr_iris(): +def test_quantile_transform_iris(): X = iris.data normalizer = QuantileTransformer() X_trans = normalizer.fit_transform(X) @@ -863,7 +863,7 @@ def test_quantile_transformr_iris(): assert_array_almost_equal(X, X_trans_inv) -def test_quantile_transformr_check_error(): +def test_quantile_transform_check_error(): X = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T @@ -908,7 +908,7 @@ def test_quantile_transformr_check_error(): X).inverse_transform, X) -def test_quantile_transformr_ignore_zeros(): +def test_quantile_transform_ignore_zeros(): X = np.array([[0, 0, 0, 0, 0], [1, 0, 2, 2, 1]]).T X_sparse = sparse.csc_matrix(X) @@ -959,7 +959,7 @@ def test_quantile_transformr_ignore_zeros(): assert_almost_equal(X_sparse.A, nq.inverse_transform(X_trans).A) -def test_quantile_transformr_dense_toy(): +def test_quantile_transform_dense_toy(): X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]).T @@ -993,7 +993,7 @@ def test_quantile_transformr_dense_toy(): X_trans = normalizer.fit_transform(X) -def test_quantile_transformr_sparse_toy(): +def test_quantile_transform_sparse_toy(): X = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [2, 4, 
0, 0, 6, 8, 0, 10, 0, 0], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T @@ -1062,7 +1062,7 @@ def test_qunatile_normalzer_bounds(): assert_array_almost_equal(X_trans, X1) -def test_quantile_transformr_pickling(): +def test_quantile_transform_pickling(): qn = QuantileTransformer(n_quantiles=100) qn_ser = pickle.dumps(qn, pickle.HIGHEST_PROTOCOL) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index ceb43fee2a8a0..d789d5f525cd4 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -362,19 +362,6 @@ def rankdata(a, method='average'): from scipy.stats import rankdata -if sp_version < (0, 14, 0): - # Before scipy 0.14.0, interp1d does not accept any assume_sorted - # argument. - def interp1d(x, y, kind='linear', axis=-1, - copy=True, bounds_error=True, - fill_value=np.nan, assume_sorted=False): - - return scipy.interpolate.interp1d(x, y, kind, axis, copy, - bounds_error, fill_value) -else: - from scipy.interpolate import interp1d - - if np_version < (1, 12): class MaskedArray(np.ma.MaskedArray): # Before numpy 1.12, np.ma.MaskedArray object is not picklable From c132211a03d12d6f8f729abece40b29a1d9410c0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 1 Mar 2017 20:57:02 +0100 Subject: [PATCH 058/106] FIX/TST fix tests --- doc/whats_new.rst | 1 + sklearn/preprocessing/data.py | 24 ++++++- sklearn/preprocessing/tests/test_data.py | 92 ++++++++++++------------ 3 files changed, 72 insertions(+), 45 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 7e5e0bb300fc1..0ada780eb1b02 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -57,6 +57,7 @@ New features during the first epochs of ridge and logistic regression. By `Arthur Mensch`_. - Added :class:`preprocessing.QuantileNormalizer` class for features + - Added :class:`preprocessing.QuantileTransformer` class for features normalization based on quantiles. :issue:`8363` by :user:`Denis Engemann `, :user:`Guillaume Lemaitre `, `Olivier Grisel`_, diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index f022b79b4cb79..5b7314f6416ac 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1993,7 +1993,8 @@ def _dense_fit(self, X): # subsample the matrix X if necessary n_samples, n_features = X.shape if self.subsample < n_samples: - subsample_idx = rng.permutation(range(n_samples))[:self.subsample] + subsample_idx = choice(range(n_samples), size=self.subsample, + replace=False, random_state=rng) else: subsample_idx = slice(None) @@ -2089,11 +2090,32 @@ def fit(self, X, y=None): raise ValueError('QuantileTransformer only accepts non-negative' ' sparse matrices') + # check the number of quantiles is less than the number of samples used + # in fitting + if (self.n_quantiles > min(X.shape[0], self.subsample)): + raise ValueError('The number of quantiles is less than either the' + ' number which will be used during fitting. Got' + ' {} quantiles for {} samples.'.format( + self.n_quantiles, min(X.shape[0], + self.subsample))) + if sparse.issparse(X): self._sparse_fit(X) else: self._dense_fit(X) + # check that the quantiles are strictly monotonically increasing + for feature_idx, quantile in enumerate(self.quantiles_): + if not np.all(np.diff(quantile) > 0): + warnings.warn('The feature values corresponding to the' + ' quantiles computed are not strictly' + ' monotonically increasing for the feature #' + '{}. This configuration is ill-posed for the' + ' QuantileTransformer. 
If this feature is' + ' a categorical feature, be aware that this' + ' transformation will not work.'.format( + feature_idx)) + return self def _transform_col(self, X_col, feature_idx, inverse): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index f49a2330b1728..8ebdac05ee017 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -855,11 +855,11 @@ def test_robust_scaler_iris_quantiles(): def test_quantile_transform_iris(): X = iris.data - normalizer = QuantileTransformer() - X_trans = normalizer.fit_transform(X) + transformer = QuantileTransformer(n_quantiles=30) + X_trans = transformer.fit_transform(X) assert_array_almost_equal(np.min(X_trans, axis=0), 0.) assert_array_almost_equal(np.max(X_trans, axis=0), 1.) - X_trans_inv = normalizer.inverse_transform(X_trans) + X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) @@ -878,33 +878,35 @@ def test_quantile_transform_check_error(): assert_raises_regex(ValueError, "Invalid value for 'subsample'", QuantileTransformer(subsample=0).fit, X_neg) - normalizer = QuantileTransformer() + transformer = QuantileTransformer(n_quantiles=10) assert_raises_regex(ValueError, "QuantileTransformer only accepts " - "non-negative sparse matrices", normalizer.fit, X_neg) - normalizer.fit(X) + "non-negative sparse matrices", transformer.fit, X_neg) + transformer.fit(X) assert_raises_regex(ValueError, "QuantileTransformer only accepts " "non-negative sparse matrices", - normalizer.transform, X_neg) + transformer.transform, X_neg) assert_raises_regex(ValueError, "QuantileTransformer only accepts " "non-negative sparse matrices", - normalizer.inverse_transform, X_neg) + transformer.inverse_transform, X_neg) X_bad_feat = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T assert_raises_regex(ValueError, "X does not have the same number of " "feature than the previously fitted data.", - normalizer.transform, X_bad_feat) + transformer.transform, X_bad_feat) assert_raises_regex(ValueError, "X does not have the same number of " "feature than the previously fitted data.", - normalizer.inverse_transform, X_bad_feat) + transformer.inverse_transform, X_bad_feat) assert_raises_regex(ValueError, "'output_distribution' has to be either" " 'norm' or 'uniform'. Got rnd instead.", QuantileTransformer( + n_quantiles=10, output_distribution='rnd').fit_transform, X) assert_raises_regex(ValueError, "'output_distribution' has to be either" " 'norm' or 'uniform'. Got rnd instead.", - QuantileTransformer(output_distribution='rnd').fit( + QuantileTransformer(n_quantiles=10, + output_distribution='rnd').fit( X).inverse_transform, X) @@ -912,16 +914,17 @@ def test_quantile_transform_ignore_zeros(): X = np.array([[0, 0, 0, 0, 0], [1, 0, 2, 2, 1]]).T X_sparse = sparse.csc_matrix(X) - nq = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) + transformer = QuantileTransformer(ignore_implicit_zeros=True, + n_quantiles=5) # dense case -> warning raise assert_warns_message(UserWarning, "'ignore_implicit_zeros' takes effect" " only with sparse matrix. 
This parameter has no" - " effect.", nq.fit, X) + " effect.", transformer.fit, X) X_gt = np.array([[0, 0, 0, 0, 0], [0, 0, 1, 1, 0]]).T - X_trans = nq.fit_transform(X_sparse) + X_trans = transformer.fit_transform(X_sparse) assert_almost_equal(X_gt, X_trans.A) # consider the case where sparse entries are missing values and user-given @@ -930,7 +933,7 @@ def test_quantile_transform_ignore_zeros(): X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]) X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8]) X_sparse = sparse.csc_matrix((X_data, (X_row, X_col))) - X_trans = nq.fit_transform(X_sparse) + X_trans = transformer.fit_transform(X_sparse) X_gt = np.array([[0., 0.5], [0., 0.], [0., 1.], @@ -942,12 +945,13 @@ def test_quantile_transform_ignore_zeros(): [0., 0.]]) assert_almost_equal(X_gt, X_trans.A) - nq = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) + transformer = QuantileTransformer(ignore_implicit_zeros=True, + n_quantiles=5) X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1]) X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1]) X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6]) X_sparse = sparse.csc_matrix((X_data, (X_row, X_col))) - X_trans = nq.fit_transform(X_sparse) + X_trans = transformer.fit_transform(X_sparse) X_gt = np.array([[0, 1], [0, 0.5], [0, 0.5], @@ -956,7 +960,7 @@ def test_quantile_transform_ignore_zeros(): [0, 0], [0, 1]]) assert_almost_equal(X_gt, X_trans.A) - assert_almost_equal(X_sparse.A, nq.inverse_transform(X_trans).A) + assert_almost_equal(X_sparse.A, transformer.inverse_transform(X_trans).A) def test_quantile_transform_dense_toy(): @@ -964,10 +968,10 @@ def test_quantile_transform_dense_toy(): [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]).T - normalizer = QuantileTransformer() - normalizer.fit(X) + transformer = QuantileTransformer(n_quantiles=5) + transformer.fit(X) - X_trans = normalizer.fit_transform(X) + X_trans = transformer.fit_transform(X) assert_almost_equal(np.min(X_trans, axis=0), 0.) assert_almost_equal(np.max(X_trans, axis=0), 1.) @@ -979,18 +983,18 @@ def test_quantile_transform_dense_toy(): [0, 0, 0], [1, 1, 1], ]) - assert_array_almost_equal(normalizer.transform(X_test), expected) + assert_array_almost_equal(transformer.transform(X_test), expected) - X_trans_inv = normalizer.inverse_transform(X_trans) + X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # test subsampling # FIXME: there is not comparison for the moment random_state = 42 - normalizer.set_params(**{'subsample': 3, - 'n_quantiles': 2, - 'random_state': random_state}) - X_trans = normalizer.fit_transform(X) + transformer.set_params(**{'subsample': 3, + 'n_quantiles': 2, + 'random_state': random_state}) + X_trans = transformer.fit_transform(X) def test_quantile_transform_sparse_toy(): @@ -1000,32 +1004,32 @@ def test_quantile_transform_sparse_toy(): X = sparse.csc_matrix(X) - normalizer = QuantileTransformer() - normalizer.fit(X) + transformer = QuantileTransformer(n_quantiles=10) + transformer.fit(X) - X_trans = normalizer.fit_transform(X) + X_trans = transformer.fit_transform(X) assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.) assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.) 
- X_trans_inv = normalizer.inverse_transform(X_trans) + X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) - normalizer_dense = QuantileTransformer().fit(X.toarray()) + transformer_dense = QuantileTransformer(n_quantiles=10).fit(X.toarray()) - X_trans = normalizer_dense.transform(X) + X_trans = transformer_dense.transform(X) assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.) assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.) - X_trans_inv = normalizer_dense.inverse_transform(X_trans) + X_trans_inv = transformer_dense.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) # test subsampling # FIXME: there is not comparison for the moment random_state = 42 - normalizer.set_params(**{'subsample': 3, - 'n_quantiles': 2, - 'random_state': random_state}) - X_trans = normalizer.fit_transform(X) + transformer.set_params(**{'subsample': 3, + 'n_quantiles': 2, + 'random_state': random_state}) + X_trans = transformer.fit_transform(X) def test_quantile_transform_axis1(): @@ -1033,21 +1037,21 @@ def test_quantile_transform_axis1(): [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]) - X_trans_a0 = quantile_transform(X.T, axis=0) - X_trans_a1 = quantile_transform(X, axis=1) + X_trans_a0 = quantile_transform(X.T, axis=0, n_quantiles=5) + X_trans_a1 = quantile_transform(X, axis=1, n_quantiles=5) assert_array_almost_equal(X_trans_a0, X_trans_a1.T) -def test_qunatile_normalzer_bounds(): +def test_qunatile_transform_bounds(): X_dense = np.array([[0, 0], [0, 0], [1, 0]]) X_sparse = sparse.csc_matrix(X_dense) # check sparse and dense are consistent - X_trans = QuantileTransformer().fit_transform(X_dense) + X_trans = QuantileTransformer(n_quantiles=3).fit_transform(X_dense) assert_array_almost_equal(X_trans, X_dense) - X_trans_sp = QuantileTransformer().fit_transform(X_sparse) + X_trans_sp = QuantileTransformer(n_quantiles=3).fit_transform(X_sparse) assert_array_almost_equal(X_trans_sp.A, X_dense) assert_array_almost_equal(X_trans, X_trans_sp.A) @@ -1057,7 +1061,7 @@ def test_qunatile_normalzer_bounds(): [1, 0.5, 0]]).T X1 = np.array([[0, 0, 1], [0.1, 0.5, 0.1]]).T - qn = QuantileTransformer().fit(X) + qn = QuantileTransformer(n_quantiles=3).fit(X) X_trans = qn.transform(X1) assert_array_almost_equal(X_trans, X1) From 9b66d71aa0299218aace44585db0da2fe922e21a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 2 Mar 2017 19:38:25 +0100 Subject: [PATCH 059/106] DOC start checking doc --- doc/modules/preprocessing.rst | 34 +++++++++++ sklearn/preprocessing/data.py | 110 ++++++++++++++++++++++++++++++---- 2 files changed, 133 insertions(+), 11 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 709239687158e..ea06c5d6b0b16 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -248,6 +248,40 @@ a :class:`KernelCenterer` can transform the kernel matrix so that it contains inner products in the feature space defined by :math:`phi` followed by removal of the mean in that space. +.. _preprocessing_transformer: + +Non-linear transformation +========================= + +In the contrary of scaling, data can be non-linearly transformed to reduce the +influence of marginal outliers present in a dataset. Additionally, reducing the +influence of those outliers allows for a more direct comparison between +features, at the cost of distorting correlations between them. 
+ +:class:`QuantileTransformer` and :func:`quantile_transform` provide a +non-parametric transformation based the quantile function to map the data to a +uniform distribution:: + + >>> from sklearn.datasets import load_iris + >>> iris = load_iris() + >>> X, y = iris.data, iris.target + >>> quantile_transformer = preprocessing.QuantileTransformer() + >>> X_trans = quantile_transformer.fit_transform(iris.data) + +It is also possible to map the transformed data to a normal distribution by +setting ``output_distribution='norm'``:: + + >>> X_trans = preprocessing.quantile_transform(X, output_distribution='norm') + +.. topic:: Sparse input + + :class:`QuantileTransformer` and :func:`quantile_transform` accept **both + dense array-like and sparse matrices from scipy.sparse as input**. + + For sparse input the data is **converted to the Compressed Sparse Columns + representation** (see ``scipy.sparse.csc_matrix``). To avoid unnecessary + memory copies, it is recommended to choose the CSC representation upstream. + .. _preprocessing_normalization: Normalization diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 5b7314f6416ac..fdb0aba01544a 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1923,10 +1923,12 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): The transformation is applied on each feature independently. The cumulative density function of a feature is used to project the original values. Features values of new/unseen data that fall below - or above the fitted range will be mapped to 0 and 1, respectively. - Note that this transform is non-linear. It may distort linear correlations - between variables measured at the same scale but renders variables measured - at different scales more directly comparable. + or above the fitted range will be mapped to the bounds of the output + distribution. Note that this transform is non-linear. It may distort linear + correlations between variables measured at the same scale but renders + variables measured at different scales more directly comparable. + + Read more in the :ref:`User Guide `. Parameters ---------- @@ -1957,8 +1959,16 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): quantiles_ : ndarray, shape (n_quantiles, n_features) The values corresponding the quantiles of reference. + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.preprocessing import QuantileTransformer + >>> iris = load_iris() + >>> X_trans = QuantileTransformer(n_quantiles=20).fit_transform(iris.data) + See also -------- + quantile_transform : Equivalent function without the object oriented API. StandardScaler : perform standardization that is faster, but less robust to outliers. @@ -2064,7 +2074,7 @@ def fit(self, X, y=None): The data used to scale along the features axis. If a sparse matrix is provided, it will be converted into a sparse ``csc_matrix``. Additionally, the sparse matrix needs to be - semi-positive. + semi-positive if `ignore_implicit_zeros` is False. Returns ------- @@ -2247,10 +2257,10 @@ def transform(self, X): Parameters ---------- X : ndarray or sparse matrix, shape (n_samples, n_features) - The data to be normalized along the features axis. If a sparse + The data used to scale along the features axis. If a sparse matrix is provided, it will be converted into a sparse ``csc_matrix``. Additionally, the sparse matrix needs to be - semi-positive. + semi-positive if `ignore_implicit_zeros` is False. 
Returns ------- @@ -2268,10 +2278,10 @@ def inverse_transform(self, X): """Back-projection to the original space. X : ndarray or sparse matrix, shape (n_samples, n_features) - The data to be normalized along the features axis. If a sparse + The data used to scale along the features axis. If a sparse matrix is provided, it will be converted into a sparse ``csc_matrix``. Additionally, the sparse matrix needs to be - semi-positive. + semi-positive if `ignore_implicit_zeros` is False. Returns ------- @@ -2286,8 +2296,86 @@ def inverse_transform(self, X): return self._dense_transform(X, inverse=True) -def quantile_transform(X, axis=0, n_quantiles=1000, subsample=int(1e5), - ignore_implicit_zeros=False, random_state=None): +def quantile_transform(X, axis=0, n_quantiles=1000, + output_distribution='uniform', + ignore_implicit_zeros=False, + subsample=int(1e5), + random_state=None): + """Transform features using quantiles information. + + This method scales the features to follow a uniform or a normal + distribution. Therefore, for a given feature, this transformation tends + to spread out the most frequent values. It also reduces the impact of + (marginal) outliers: this is therefore a robust preprocessing scheme. + + The transformation is applied on each feature independently. + The cumulative density function of a feature is used to project the + original values. Features values of new/unseen data that fall below + or above the fitted range will be mapped to the bounds of the output + distribution. Note that this transform is non-linear. It may distort linear + correlations between variables measured at the same scale but renders + variables measured at different scales more directly comparable. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : ndarray or sparse matrix, shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + semi-positive if `ignore_implicit_zeros` is False. + + axis : 0 or 1, optional (0 by default) + axis used to normalize the data along. If 1, independently normalize + each sample, otherwise (if 0) normalize each feature. + + n_quantiles : int, optional (default=1000) + Number of quantiles to be computed. It corresponds to the number + of landmarks used to discretize the cumulative density function. + + output_distribution : str, optional (default='uniform') + Marginal distribution for the transformed data. The choices are + 'uniform' (default) or 'norm'. + + ignore_implicit_zeros : bool, optional (default=False) + Apply only for sparse matrices. If True, the sparse entries of the + matrix are discarded to compute the quantile statistics. If false, + these entries are accounting for zeros. + + subsample : int, optional (default=1e5) + Maximum number of samples used to estimate the quantiles. + + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by np.random. + + Attributes + ---------- + quantiles_ : ndarray, shape (n_quantiles, n_features) + The values corresponding the quantiles of reference. 
+ + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.preprocessing import quantile_transform + >>> iris = load_iris() + >>> X_trans = quantile_transform(iris.data, n_quantiles=20) + + See also + -------- + QuantileTransformer : Performs quantile-based scaling using the + ``Transformer`` API (e.g. as part of a preprocessing + :class:`sklearn.pipeline.Pipeline`). + + scale : perform standardization that is faster, but less robust + to outliers. + + robust_scale : perform robust standardization that removes the influence + of outliers but does not put outliers and inliers on the same scale. + """ n = QuantileTransformer(n_quantiles=n_quantiles, subsample=subsample, ignore_implicit_zeros=ignore_implicit_zeros, random_state=random_state) From fb88fa1e8ef1d8489d7002783011a438670f7434 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 14 Mar 2017 17:29:18 +0100 Subject: [PATCH 060/106] TST add test to check the behaviour of interp numpy --- sklearn/preprocessing/data.py | 13 ++----------- sklearn/preprocessing/tests/test_data.py | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index fdb0aba01544a..f395f74f24208 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1956,7 +1956,7 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): Attributes ---------- - quantiles_ : ndarray, shape (n_quantiles, n_features) + quantiles_ : ndarray, shape (n_quantiles_, n_features) The values corresponding the quantiles of reference. Examples @@ -2100,15 +2100,6 @@ def fit(self, X, y=None): raise ValueError('QuantileTransformer only accepts non-negative' ' sparse matrices') - # check the number of quantiles is less than the number of samples used - # in fitting - if (self.n_quantiles > min(X.shape[0], self.subsample)): - raise ValueError('The number of quantiles is less than either the' - ' number which will be used during fitting. Got' - ' {} quantiles for {} samples.'.format( - self.n_quantiles, min(X.shape[0], - self.subsample))) - if sparse.issparse(X): self._sparse_fit(X) else: @@ -2354,7 +2345,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, Attributes ---------- - quantiles_ : ndarray, shape (n_quantiles, n_features) + quantiles_ : ndarray, shape (n_quantiles_, n_features) The values corresponding the quantiles of reference. Examples diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 8ebdac05ee017..0530d185ec388 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1081,6 +1081,29 @@ def test_quantile_transform_pickling(): qn2.transform(iris.data)) +def test_quantile_transform_numpy_interp_behaviour(): + # The quantile transformer relies on the numpy implementation of + # 'interp' function. In the presence of a predominant constant + # feature values or a large number of quantiles, a single feature + # value is mapped to different quantiles. The default behaviour of + # 'interp' will be to return the larger quantile associated to the + # feature value. This test attends to check if there is any + # changes in the 'interp' function and to act accordingly. 
+ unique_feature = [0, 0.5, 1] + X = np.transpose([[unique_feature[0]] * 1 + + [unique_feature[1]] * 7 + + [unique_feature[2]] * 2]) + qt = QuantileTransformer(n_quantiles=100) + qt.fit(X) + ref = np.linspace(0., 1., num=qt.n_quantiles) + max_quantiles_idx = [np.flatnonzero(qt.quantiles_ == unique_feature[i])[-1] + for i in range(len(unique_feature))] + X_trans = np.transpose([[ref[max_quantiles_idx[0]]] * 1 + + [ref[max_quantiles_idx[1]]] * 7 + + [ref[max_quantiles_idx[2]]] * 2]) + assert_array_almost_equal(qt.transform(X), X_trans) + + def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), From f46aea9a1acbd4b42315f62db6d54da516b81e4c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 14 Mar 2017 17:50:00 +0100 Subject: [PATCH 061/106] TST/EHN Add the possibility to add noise to compute quantile --- sklearn/preprocessing/data.py | 54 +++++++++++++++++------- sklearn/preprocessing/tests/test_data.py | 48 +++++++++++++++------ 2 files changed, 72 insertions(+), 30 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index f395f74f24208..2000ee15ed261 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1948,6 +1948,11 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): subsample : int, optional (default=1e5) Maximum number of samples used to estimate the quantiles. + noise_variance : None or float, optional (default=None) + Variance of the noise which will be added to the subsamples to + compute the corresponding quantiles. This parameter can be + useful if there a feature value is predominant. + random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -1979,11 +1984,12 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): def __init__(self, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), - random_state=None): + noise_variance=None, random_state=None): self.n_quantiles = n_quantiles self.output_distribution = output_distribution self.ignore_implicit_zeros = ignore_implicit_zeros self.subsample = subsample + self.noise_variance = noise_variance self.random_state = random_state def _dense_fit(self, X): @@ -2008,12 +2014,19 @@ def _dense_fit(self, X): else: subsample_idx = slice(None) + n_subsample = min(n_samples, self.subsample) + if self.noise_variance is None: + noise = np.zeros((n_subsample, )) + else: + noise = rng.normal(0, self.noise_variance, + size=(n_subsample,)) + # for compatibility issue with numpy<=1.8.X, references # need to be a list scaled between 0 and 100 references = np.linspace(0, 100, self.n_quantiles, endpoint=True).tolist() self.quantiles_ = np.transpose( - [np.percentile(X[subsample_idx, feature_idx], references) + [np.percentile(X[subsample_idx, feature_idx] + noise, references) for feature_idx in range(n_features)]) def _sparse_fit(self, X): @@ -2056,13 +2069,21 @@ def _sparse_fit(self, X): else: column_data = np.zeros(shape=n_samples, dtype=X.dtype) column_data[:len(column_nnz_data)] = column_nnz_data + + n_subsample = column_data.size + if self.noise_variance is None: + noise = np.zeros((n_subsample, )) + else: + noise = rng.normal(0, self.noise_variance, + size=(n_subsample,)) + if not column_data.size: # if no nnz, an error will be raised for computing the # quantiles. Force the quantiles to be zeros. 
self.quantiles_.append([0] * len(references)) else: self.quantiles_.append( - np.percentile(column_data, references)) + np.percentile(column_data + noise, references)) self.quantiles_ = np.transpose(self.quantiles_) def fit(self, X, y=None): @@ -2090,9 +2111,15 @@ def fit(self, X, y=None): if self.subsample <= 0: raise ValueError("Invalid value for 'subsample': %d. " - "The number of quantiles must be at least one." + "The number of subsamples must be at least one." % self.subsample) + if self.noise_variance is not None: + if self.noise_variance <= 0: + raise ValueError("Invalid value for 'noise_variance': %d. " + "The noise variance should be greater than 0." + % self.noise_variance) + # we only accept positive sparse matrix when ignore_implicit_zeros is # false if (not self.ignore_implicit_zeros and @@ -2105,18 +2132,6 @@ def fit(self, X, y=None): else: self._dense_fit(X) - # check that the quantiles are strictly monotonically increasing - for feature_idx, quantile in enumerate(self.quantiles_): - if not np.all(np.diff(quantile) > 0): - warnings.warn('The feature values corresponding to the' - ' quantiles computed are not strictly' - ' monotonically increasing for the feature #' - '{}. This configuration is ill-posed for the' - ' QuantileTransformer. If this feature is' - ' a categorical feature, be aware that this' - ' transformation will not work.'.format( - feature_idx)) - return self def _transform_col(self, X_col, feature_idx, inverse): @@ -2291,6 +2306,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), + noise_variance=None, random_state=None): """Transform features using quantiles information. @@ -2337,6 +2353,11 @@ def quantile_transform(X, axis=0, n_quantiles=1000, subsample : int, optional (default=1e5) Maximum number of samples used to estimate the quantiles. + noise_variance : None or float, optional (default=None) + Variance of the noise which will be added to the subsamples to + compute the corresponding quantiles. This parameter can be + useful if there a feature value is predominant. 
+ random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -2369,6 +2390,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, """ n = QuantileTransformer(n_quantiles=n_quantiles, subsample=subsample, ignore_implicit_zeros=ignore_implicit_zeros, + noise_variance=noise_variance, random_state=random_state) if axis == 0: return n.fit_transform(X) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 0530d185ec388..43f3c1efa06e5 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -877,6 +877,8 @@ def test_quantile_transform_check_error(): QuantileTransformer(n_quantiles=0).fit, X_neg) assert_raises_regex(ValueError, "Invalid value for 'subsample'", QuantileTransformer(subsample=0).fit, X_neg) + assert_raises_regex(ValueError, "Invalid value for 'noise_variance'", + QuantileTransformer(noise_variance=0).fit, X_neg) transformer = QuantileTransformer(n_quantiles=10) assert_raises_regex(ValueError, "QuantileTransformer only accepts " @@ -1061,24 +1063,42 @@ def test_qunatile_transform_bounds(): [1, 0.5, 0]]).T X1 = np.array([[0, 0, 1], [0.1, 0.5, 0.1]]).T - qn = QuantileTransformer(n_quantiles=3).fit(X) - X_trans = qn.transform(X1) + transformer = QuantileTransformer(n_quantiles=3).fit(X) + X_trans = transformer.transform(X1) assert_array_almost_equal(X_trans, X1) def test_quantile_transform_pickling(): - qn = QuantileTransformer(n_quantiles=100) - - qn_ser = pickle.dumps(qn, pickle.HIGHEST_PROTOCOL) - qn2 = pickle.loads(qn_ser) - assert_false(hasattr(qn2, 'f_transform_')) - assert_false(hasattr(qn2, 'f_inverse_transform_')) - - qn.fit(iris.data) - qn_ser = pickle.dumps(qn, pickle.HIGHEST_PROTOCOL) - qn2 = pickle.loads(qn_ser) - assert_array_almost_equal(qn.transform(iris.data), - qn2.transform(iris.data)) + transformer = QuantileTransformer(n_quantiles=100) + + transformer_ser = pickle.dumps(transformer, pickle.HIGHEST_PROTOCOL) + transformer2 = pickle.loads(transformer_ser) + assert_false(hasattr(transformer2, 'f_transform_')) + assert_false(hasattr(transformer2, 'f_inverse_transform_')) + + transformer.fit(iris.data) + transformer_ser = pickle.dumps(transformer, pickle.HIGHEST_PROTOCOL) + transformer2 = pickle.loads(transformer_ser) + assert_array_almost_equal(transformer.transform(iris.data), + transformer2.transform(iris.data)) + + +def test_quantile_transform_add_noise_subsamples(): + # toy examples + unique_feature = [0, 0.5, 1] + X = np.transpose([[unique_feature[0]] * 1 + + [unique_feature[1]] * 7 + + [unique_feature[2]] * 2]) + transformer = QuantileTransformer(n_quantiles=100, noise_variance=1e-7) + transformer.fit(X) + assert_true(np.all(np.diff(transformer.quantiles_) > 0)) + # iris dataset + X = iris.data + transformer = QuantileTransformer(n_quantiles=1000, noise_variance=1e-7) + X_trans = transformer.fit_transform(X) + X_trans_inv = transformer.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + assert_true(np.all(np.diff(transformer.quantiles_, axis=0) > 0)) def test_quantile_transform_numpy_interp_behaviour(): From d55295a6f381f0c9c88b68c090e3e82a02738298 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 14 Mar 2017 18:29:21 +0100 Subject: [PATCH 062/106] FIX factorize quantile computation --- sklearn/preprocessing/data.py | 32 ++++++++++++++++---------------- 1 file changed, 16 
insertions(+), 16 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 2000ee15ed261..c165ea52c1e30 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1992,6 +1992,19 @@ def __init__(self, n_quantiles=1000, output_distribution='uniform', self.noise_variance = noise_variance self.random_state = random_state + def _compute_quantile_one_column(self, X_col, references): + """Private function to compute the quantiles for one features.""" + rng = check_random_state(self.random_state) + + if self.noise_variance is None: + noise = np.zeros(X_col.shape) + else: + noise = rng.normal(0, self.noise_variance, size=X_col.shape) + + quantile = np.percentile(X_col + noise, references) + + return quantile + def _dense_fit(self, X): """Compute percentiles for dense matrices. @@ -2014,19 +2027,13 @@ def _dense_fit(self, X): else: subsample_idx = slice(None) - n_subsample = min(n_samples, self.subsample) - if self.noise_variance is None: - noise = np.zeros((n_subsample, )) - else: - noise = rng.normal(0, self.noise_variance, - size=(n_subsample,)) - # for compatibility issue with numpy<=1.8.X, references # need to be a list scaled between 0 and 100 references = np.linspace(0, 100, self.n_quantiles, endpoint=True).tolist() self.quantiles_ = np.transpose( - [np.percentile(X[subsample_idx, feature_idx] + noise, references) + [self._compute_quantile_one_column(X[subsample_idx, feature_idx], + references) for feature_idx in range(n_features)]) def _sparse_fit(self, X): @@ -2070,20 +2077,13 @@ def _sparse_fit(self, X): column_data = np.zeros(shape=n_samples, dtype=X.dtype) column_data[:len(column_nnz_data)] = column_nnz_data - n_subsample = column_data.size - if self.noise_variance is None: - noise = np.zeros((n_subsample, )) - else: - noise = rng.normal(0, self.noise_variance, - size=(n_subsample,)) - if not column_data.size: # if no nnz, an error will be raised for computing the # quantiles. Force the quantiles to be zeros. self.quantiles_.append([0] * len(references)) else: self.quantiles_.append( - np.percentile(column_data + noise, references)) + self._compute_quantile_one_column(column_data, references)) self.quantiles_ = np.transpose(self.quantiles_) def fit(self, X, y=None): From 38127d53c2a8e3dbfa0a879ac83ac49735d4b1fc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 14 Mar 2017 22:55:11 +0100 Subject: [PATCH 063/106] FIX fixes issues --- sklearn/preprocessing/data.py | 76 +++++++++++------------- sklearn/preprocessing/tests/test_data.py | 8 +-- 2 files changed, 40 insertions(+), 44 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index c165ea52c1e30..38965693cb8af 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1948,10 +1948,11 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): subsample : int, optional (default=1e5) Maximum number of samples used to estimate the quantiles. - noise_variance : None or float, optional (default=None) - Variance of the noise which will be added to the subsamples to - compute the corresponding quantiles. This parameter can be - useful if there a feature value is predominant. + smoothing_noise : None or float, optional (default=None) + Standard deviation of the noise which will be added to the + subsamples to compute the corresponding quantiles. This + parameter can be useful if there a feature value is + predominant. 
random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -1984,28 +1985,25 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): def __init__(self, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), - noise_variance=None, random_state=None): + smoothing_noise=None, random_state=None): self.n_quantiles = n_quantiles self.output_distribution = output_distribution self.ignore_implicit_zeros = ignore_implicit_zeros self.subsample = subsample - self.noise_variance = noise_variance + self.smoothing_noise = smoothing_noise self.random_state = random_state - def _compute_quantile_one_column(self, X_col, references): + def _compute_quantile_one_column(self, X_col, references, random_state): """Private function to compute the quantiles for one features.""" - rng = check_random_state(self.random_state) - - if self.noise_variance is None: + if self.smoothing_noise is None: noise = np.zeros(X_col.shape) else: - noise = rng.normal(0, self.noise_variance, size=X_col.shape) + noise = random_state.normal(0, self.smoothing_noise, + size=X_col.shape) - quantile = np.percentile(X_col + noise, references) + return np.percentile(X_col + noise, references) - return quantile - - def _dense_fit(self, X): + def _dense_fit(self, X, random_state): """Compute percentiles for dense matrices. Parameters @@ -2017,13 +2015,11 @@ def _dense_fit(self, X): warnings.warn("'ignore_implicit_zeros' takes effect only with" " sparse matrix. This parameter has no effect.") - rng = check_random_state(self.random_state) - # subsample the matrix X if necessary n_samples, n_features = X.shape if self.subsample < n_samples: subsample_idx = choice(range(n_samples), size=self.subsample, - replace=False, random_state=rng) + replace=False, random_state=random_state) else: subsample_idx = slice(None) @@ -2033,10 +2029,10 @@ def _dense_fit(self, X): endpoint=True).tolist() self.quantiles_ = np.transpose( [self._compute_quantile_one_column(X[subsample_idx, feature_idx], - references) + references, random_state) for feature_idx in range(n_features)]) - def _sparse_fit(self, X): + def _sparse_fit(self, X, random_state): """Compute percentiles for sparse matrices. Parameters @@ -2045,8 +2041,6 @@ def _sparse_fit(self, X): The data used to scale along the features axis. The sparse matrix needs to be semi-positive. 
""" - rng = check_random_state(self.random_state) - n_samples, n_features = X.shape # for compatibility issue with numpy<=1.8.X, references @@ -2065,10 +2059,9 @@ def _sparse_fit(self, X): dtype=X.dtype) else: column_data = np.zeros(shape=self.subsample, dtype=X.dtype) - column_data[:column_subsample] = choice(column_nnz_data, - size=column_subsample, - replace=False, - random_state=rng) + column_data[:column_subsample] = choice( + column_nnz_data, size=column_subsample, + replace=False, random_state=random_state) else: if self.ignore_implicit_zeros: column_data = np.zeros(shape=len(column_nnz_data), @@ -2083,7 +2076,8 @@ def _sparse_fit(self, X): self.quantiles_.append([0] * len(references)) else: self.quantiles_.append( - self._compute_quantile_one_column(column_data, references)) + self._compute_quantile_one_column(column_data, references, + random_state)) self.quantiles_ = np.transpose(self.quantiles_) def fit(self, X, y=None): @@ -2103,6 +2097,7 @@ def fit(self, X, y=None): Returns self """ X = check_array(X, accept_sparse='csc') + rng = check_random_state(self.random_state) if self.n_quantiles <= 0: raise ValueError("Invalid value for 'n_quantiles': %d. " @@ -2114,11 +2109,11 @@ def fit(self, X, y=None): "The number of subsamples must be at least one." % self.subsample) - if self.noise_variance is not None: - if self.noise_variance <= 0: - raise ValueError("Invalid value for 'noise_variance': %d. " - "The noise variance should be greater than 0." - % self.noise_variance) + if self.smoothing_noise is not None: + if self.smoothing_noise <= 0: + raise ValueError("Invalid value for 'smoothing_noise': %d. " + "The noise std. dev. should be greater than 0." + % self.smoothing_noise) # we only accept positive sparse matrix when ignore_implicit_zeros is # false @@ -2128,9 +2123,9 @@ def fit(self, X, y=None): ' sparse matrices') if sparse.issparse(X): - self._sparse_fit(X) + self._sparse_fit(X, rng) else: - self._dense_fit(X) + self._dense_fit(X, rng) return self @@ -2306,7 +2301,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), - noise_variance=None, + smoothing_noise=None, random_state=None): """Transform features using quantiles information. @@ -2353,10 +2348,11 @@ def quantile_transform(X, axis=0, n_quantiles=1000, subsample : int, optional (default=1e5) Maximum number of samples used to estimate the quantiles. - noise_variance : None or float, optional (default=None) - Variance of the noise which will be added to the subsamples to - compute the corresponding quantiles. This parameter can be - useful if there a feature value is predominant. + smoothing_noise : None or float, optional (default=None) + Standard deviation of the noise which will be added to the + subsamples to compute the corresponding quantiles. This + parameter can be useful if there a feature value is + predominant. 
random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -2390,7 +2386,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, """ n = QuantileTransformer(n_quantiles=n_quantiles, subsample=subsample, ignore_implicit_zeros=ignore_implicit_zeros, - noise_variance=noise_variance, + smoothing_noise=smoothing_noise, random_state=random_state) if axis == 0: return n.fit_transform(X) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 43f3c1efa06e5..c4678f50e3890 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -877,8 +877,8 @@ def test_quantile_transform_check_error(): QuantileTransformer(n_quantiles=0).fit, X_neg) assert_raises_regex(ValueError, "Invalid value for 'subsample'", QuantileTransformer(subsample=0).fit, X_neg) - assert_raises_regex(ValueError, "Invalid value for 'noise_variance'", - QuantileTransformer(noise_variance=0).fit, X_neg) + assert_raises_regex(ValueError, "Invalid value for 'smoothing_noise'", + QuantileTransformer(smoothing_noise=0).fit, X_neg) transformer = QuantileTransformer(n_quantiles=10) assert_raises_regex(ValueError, "QuantileTransformer only accepts " @@ -1089,12 +1089,12 @@ def test_quantile_transform_add_noise_subsamples(): X = np.transpose([[unique_feature[0]] * 1 + [unique_feature[1]] * 7 + [unique_feature[2]] * 2]) - transformer = QuantileTransformer(n_quantiles=100, noise_variance=1e-7) + transformer = QuantileTransformer(n_quantiles=100, smoothing_noise=1e-7) transformer.fit(X) assert_true(np.all(np.diff(transformer.quantiles_) > 0)) # iris dataset X = iris.data - transformer = QuantileTransformer(n_quantiles=1000, noise_variance=1e-7) + transformer = QuantileTransformer(n_quantiles=1000, smoothing_noise=1e-7) X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) From 90fa3bd4f8f4b9f3dac42e163155e76566497ab3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 14 Mar 2017 23:55:12 +0100 Subject: [PATCH 064/106] PEP8 --- sklearn/preprocessing/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 38965693cb8af..a286bc7dcd9df 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2112,8 +2112,8 @@ def fit(self, X, y=None): if self.smoothing_noise is not None: if self.smoothing_noise <= 0: raise ValueError("Invalid value for 'smoothing_noise': %d. " - "The noise std. dev. should be greater than 0." - % self.smoothing_noise) + "The noise std. dev. should be greater than " + "0." % self.smoothing_noise) # we only accept positive sparse matrix when ignore_implicit_zeros is # false From ba8339d364d0c49bb91b364bf780bdd5b3629573 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Mar 2017 00:33:47 +0100 Subject: [PATCH 065/106] FIX/DOC correct doc --- sklearn/preprocessing/data.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index a286bc7dcd9df..cc0a61f4d333e 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1949,10 +1949,9 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): Maximum number of samples used to estimate the quantiles. 
smoothing_noise : None or float, optional (default=None) - Standard deviation of the noise which will be added to the - subsamples to compute the corresponding quantiles. This - parameter can be useful if there a feature value is - predominant. + Standard deviation of the added noise before computing the + quantiles. This parameter is useful if there is a predominant + feature value predominant. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -2349,10 +2348,9 @@ def quantile_transform(X, axis=0, n_quantiles=1000, Maximum number of samples used to estimate the quantiles. smoothing_noise : None or float, optional (default=None) - Standard deviation of the noise which will be added to the - subsamples to compute the corresponding quantiles. This - parameter can be useful if there a feature value is - predominant. + Standard deviation of the added noise before computing the + quantiles. This parameter is useful if there is a predominant + feature value predominant. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; From 9a1b79eb1f2f3d8bac13c72ff729a652dd267104 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Mar 2017 11:50:22 +0100 Subject: [PATCH 066/106] TST/DOC improve doc and add random state --- sklearn/preprocessing/tests/test_data.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index c4678f50e3890..cbd419b77679f 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1089,15 +1089,23 @@ def test_quantile_transform_add_noise_subsamples(): X = np.transpose([[unique_feature[0]] * 1 + [unique_feature[1]] * 7 + [unique_feature[2]] * 2]) - transformer = QuantileTransformer(n_quantiles=100, smoothing_noise=1e-7) + transformer = QuantileTransformer(n_quantiles=100, smoothing_noise=1e-7, + random_state=0) transformer.fit(X) + # check that the feature values associated to quantiles are + # strictly monitically increasing as suggested by the 'interp' + # function from numpy assert_true(np.all(np.diff(transformer.quantiles_) > 0)) # iris dataset X = iris.data - transformer = QuantileTransformer(n_quantiles=1000, smoothing_noise=1e-7) + transformer = QuantileTransformer(n_quantiles=1000, smoothing_noise=1e-7, + random_state=0) X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) + # check that the feature values associated to quantiles are + # strictly monitically increasing as suggested by the 'interp' + # function from numpy assert_true(np.all(np.diff(transformer.quantiles_, axis=0) > 0)) @@ -1106,9 +1114,12 @@ def test_quantile_transform_numpy_interp_behaviour(): # 'interp' function. In the presence of a predominant constant # feature values or a large number of quantiles, a single feature # value is mapped to different quantiles. The default behaviour of - # 'interp' will be to return the larger quantile associated to the + # 'interp' will be returning the larger quantile associated to the # feature value. This test attends to check if there is any - # changes in the 'interp' function and to act accordingly. + # changes in the 'interp' function and to act accordingly. 
This + # implementation subtilities is mention in the docstring of the + # 'interp' function. + unique_feature = [0, 0.5, 1] X = np.transpose([[unique_feature[0]] * 1 + [unique_feature[1]] * 7 + From dabd403ef563498589138de4e90aed398aea6a8d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Mar 2017 14:35:34 +0100 Subject: [PATCH 067/106] EXA add examples to illustrate the use of smoothing_noise --- ...plot_smoothing_noise_quantile_transform.py | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100755 examples/preprocessing/plot_smoothing_noise_quantile_transform.py diff --git a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py new file mode 100755 index 0000000000000..0edf2bb58b4ba --- /dev/null +++ b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""======================================================== +Effect of smoothing noise when using QuantileTransformer +======================================================== + +This example shows the effect of applying a small noise before +computing the quantiles in the QuantileTransformer. This parameter can +be used if a constant feature value is predominant in the dataset. It +will lead to a difficult interpretation when the quantiles computed +are introspected. + +""" + +# Author: Guillaume Lemaitre +# License: BSD 3 clause + +import numpy as np +import matplotlib.pyplot as plt + +from sklearn.preprocessing import QuantileTransformer + +print(__doc__) + +N_QUANTILES = 1000 +FEAT_VAL = 3.0 + + +def plot_transform_feat_val(ax, transformer, title): + """Plot the full transformation mapping the feature values as well as + a single feature.""" + ref = np.linspace(0, 1, num=N_QUANTILES) + + ax.plot(transformer.quantiles_, ref) + ax.scatter(FEAT_VAL, transformer.transform(FEAT_VAL), c='r', + label=r'$f({0}) = {1:.2f}$'.format( + FEAT_VAL, + np.ravel(transformer.transform(FEAT_VAL))[0])) + ax.set_xlabel('Feature values') + ax.set_ylabel('Quantiles in %') + ax.set_title(title) + ax.legend(loc=4) + # make nice axis layout + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + ax.set_xlim([1, 5.1]) + ax.set_ylim([0, 1]) + ax.spines['left'].set_position(('outward', 10)) + ax.spines['bottom'].set_position(('outward', 10)) + + +############################################################################### +# We can create a synthetic dataset which could represent the +# customers' ratings for restaurant. The scale used is ranging from 1 +# to 5 and a large number of customers attributed a grade of 3 to the +# current restaurant. + +X = np.array([1] * 1000 + + [2] * 2000 + + [3] * 7000 + + [4] * 2000 + + [5] * 1000).reshape(-1, 1) + +# create the subplots +_, (ax1, ax2) = plt.subplots(1, 2) + +############################################################################### +# By default, the ``QuantileTransformer`` does not apply any smoothing +# noise. Dealing with dataset with predominant values, the quantile +# return for such value will correspond to the largest quantile +# computed. In practise, marchine learning algorithms will usually not +# be affected by such characteristics. However, manual interpretation +# might be counter intuitive. 
+ +qt = QuantileTransformer(n_quantiles=N_QUANTILES) +qt.fit(X) +plot_transform_feat_val(ax1, qt, 'Mapping without using smoothing noise') + +############################################################################### +# From the above plot, we would expect that a vote corresponding to +# the value 3 would be mapped to the median (e.g., 0.5). However, the +# default behaviour of the 'interp' numpy function will map this +# feature value to the greater quantile as show by the marker in the +# figure. +# +# A solution is to applied a small smoothing noise before the +# computation of the quantiles. The parameter 'smoothing_noise' offers +# this possibility as illustrated below. + +qt = QuantileTransformer(n_quantiles=N_QUANTILES, + smoothing_noise=1e-7) +qt.fit(X) +plot_transform_feat_val(ax2, qt, 'Mapping using smoothing noise') + +plt.show() From 29c24e08afabf01e833c8325628bda4ac5e13606 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Mar 2017 14:54:56 +0100 Subject: [PATCH 068/106] FIX/DOC fix some grammar --- ...plot_smoothing_noise_quantile_transform.py | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py index 0edf2bb58b4ba..b0461d67c5e39 100755 --- a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py +++ b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py @@ -5,10 +5,10 @@ ======================================================== This example shows the effect of applying a small noise before -computing the quantiles in the QuantileTransformer. This parameter can -be used if a constant feature value is predominant in the dataset. It -will lead to a difficult interpretation when the quantiles computed -are introspected. +computing the quantiles in the QuantileTransformer. The parameter +``smoothing_noise`` can be used if a constant feature value is +predominant in the dataset. It will lead to a difficult interpretation +when the quantiles computed are manually checked. """ @@ -52,10 +52,10 @@ def plot_transform_feat_val(ax, transformer, title): ############################################################################### -# We can create a synthetic dataset which could represent the -# customers' ratings for restaurant. The scale used is ranging from 1 -# to 5 and a large number of customers attributed a grade of 3 to the -# current restaurant. +# We can create a synthetic dataset representing the customers' +# ratings for a restaurant. The scale used is ranging from 1 to 5 and +# a large number of customers attributed a grade of 3 to the current +# restaurant. X = np.array([1] * 1000 + [2] * 2000 + @@ -68,11 +68,11 @@ def plot_transform_feat_val(ax, transformer, title): ############################################################################### # By default, the ``QuantileTransformer`` does not apply any smoothing -# noise. Dealing with dataset with predominant values, the quantile -# return for such value will correspond to the largest quantile -# computed. In practise, marchine learning algorithms will usually not -# be affected by such characteristics. However, manual interpretation -# might be counter intuitive. +# noise. Dealing with dataset with a predominant value, the quantile +# computed for such value will correspond to the largest quantiled. In +# practise, marchine learning algorithms will usually not be affected +# by such characteristics. 
However, manual interpretation might be +# counter intuitive. qt = QuantileTransformer(n_quantiles=N_QUANTILES) qt.fit(X) @@ -85,7 +85,7 @@ def plot_transform_feat_val(ax, transformer, title): # feature value to the greater quantile as show by the marker in the # figure. # -# A solution is to applied a small smoothing noise before the +# A solution is to apply a small smoothing noise before the # computation of the quantiles. The parameter 'smoothing_noise' offers # this possibility as illustrated below. @@ -94,4 +94,7 @@ def plot_transform_feat_val(ax, transformer, title): qt.fit(X) plot_transform_feat_val(ax2, qt, 'Mapping using smoothing noise') +############################################################################### +# In this last case, the marker is centered at the median as expected. + plt.show() From 3023a2f3d31341d4fa405daeb8bde7a1176544ff Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Mar 2017 16:12:54 +0100 Subject: [PATCH 069/106] DOC fix example --- ...plot_smoothing_noise_quantile_transform.py | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py index b0461d67c5e39..ec3223615abed 100755 --- a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py +++ b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py @@ -66,6 +66,15 @@ def plot_transform_feat_val(ax, transformer, title): # create the subplots _, (ax1, ax2) = plt.subplots(1, 2) +qt = QuantileTransformer(n_quantiles=N_QUANTILES) +qt.fit(X) +plot_transform_feat_val(ax1, qt, 'Mapping without using smoothing noise') + +qt = QuantileTransformer(n_quantiles=N_QUANTILES, + smoothing_noise=1e-7) +qt.fit(X) +plot_transform_feat_val(ax2, qt, 'Mapping using smoothing noise') + ############################################################################### # By default, the ``QuantileTransformer`` does not apply any smoothing # noise. Dealing with dataset with a predominant value, the quantile @@ -73,12 +82,7 @@ def plot_transform_feat_val(ax, transformer, title): # practise, marchine learning algorithms will usually not be affected # by such characteristics. However, manual interpretation might be # counter intuitive. - -qt = QuantileTransformer(n_quantiles=N_QUANTILES) -qt.fit(X) -plot_transform_feat_val(ax1, qt, 'Mapping without using smoothing noise') - -############################################################################### +# # From the above plot, we would expect that a vote corresponding to # the value 3 would be mapped to the median (e.g., 0.5). However, the # default behaviour of the 'interp' numpy function will map this @@ -86,15 +90,8 @@ def plot_transform_feat_val(ax, transformer, title): # figure. # # A solution is to apply a small smoothing noise before the -# computation of the quantiles. The parameter 'smoothing_noise' offers -# this possibility as illustrated below. - -qt = QuantileTransformer(n_quantiles=N_QUANTILES, - smoothing_noise=1e-7) -qt.fit(X) -plot_transform_feat_val(ax2, qt, 'Mapping using smoothing noise') - -############################################################################### -# In this last case, the marker is centered at the median as expected. +# computation of the quantiles. The parameter ``smoothing_noise`` offers +# this possibility as illustrated above. +# In this case, the marker is centered at the median as expected. 
plt.show() From 17db1ff38e320cca3bcc906e4bae88cf0b343fb4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Mar 2017 16:57:01 +0100 Subject: [PATCH 070/106] DOC/EXA make plot titles more succint --- .../preprocessing/plot_smoothing_noise_quantile_transform.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py index ec3223615abed..9c5ff8e19bc35 100755 --- a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py +++ b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py @@ -68,12 +68,13 @@ def plot_transform_feat_val(ax, transformer, title): qt = QuantileTransformer(n_quantiles=N_QUANTILES) qt.fit(X) -plot_transform_feat_val(ax1, qt, 'Mapping without using smoothing noise') +plot_transform_feat_val(ax1, qt, 'Without smoothing') qt = QuantileTransformer(n_quantiles=N_QUANTILES, smoothing_noise=1e-7) qt.fit(X) -plot_transform_feat_val(ax2, qt, 'Mapping using smoothing noise') +plot_transform_feat_val(ax2, qt, 'With smoothing') +plt.tight_layout() ############################################################################### # By default, the ``QuantileTransformer`` does not apply any smoothing From 1de03ba5db8e59cc5de113ba43bc65b621aa9ced Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Mar 2017 19:53:52 +0100 Subject: [PATCH 071/106] EXA improve explanation --- examples/preprocessing/plot_all_scaling.py | 45 +++++++++++++++++----- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index ef354ffeb63c4..37170714dffdb 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -1,20 +1,23 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -""" -============================================================= +"""============================================================= Compare the effect of different scalers on data with outliers ============================================================= -The feature 0 and feature 5 of california housing dataset contains large -outliers that can make visualization of the data difficult. +The feature 0 and feature 5 of california housing dataset are outside +of the typical range [0, 1] and contain large outliers. These two +characteristics lead to difficulties to visualize the data and, more +importantly, they can degrade the fitting procedure of most of machine +learning algorithms. + +Indeed, data spread in the standard range [0, 1] is a requirement for +a large number of machine learning algorithms such as metrics-based +algorithms or algorithms using gradient-based optimization. -Also linear models like :class:`sklearn.linear_model.SVM` require data which is -approximately normalized to the [-1, 1] or [0, 1] range, or at the very least -have all the features on the same scale. +This example uses different scalers, transformers and normalizers to +bring the data within a smaller range. -This example uses different scalers and normalizers to bring the data within a -smaller range. 
""" # Author: Raghav RV @@ -148,4 +151,28 @@ def plot_distribution(axes, X, y, hist_nbins=50, plot_title="", size=(15, 10), mpl.colorbar.ColorbarBase(heatmap_legend_ax, cmap=cm.plasma_r, norm=norm, orientation='horizontal', label='Color mapping for values of y') + +############################################################################### +# The different scalers applied a linear transformation to the +# data. The main between these scalers lie in the fact that they are +# using a subset of data to apply this linear +# scaling. ``MinMaxScaler`` will take the full data range while +# ``RobustScaler`` will use a subset of data by discarding data +# outside of certain percentiles. The ``MaxAbsScaler`` will use the +# maximum absolute value while the ``StandardScaler`` will use the +# mean and stardard deviation to scale the data. +# +# The ``QuantileTransformer`` will shrink the distance between the +# outliers and inliers, since this transformation is +# non-linear. Consequently, comparison between features is made easier +# while the potential discriminative power of outliers is +# discarded. This is an important to consider if the aim of the +# application is to detect those outliers. Additionally, this +# transform can map the data to either a uniform or a normal +# distribution. +# +# Unlike the scalers and the transformers, the `Normalizer` applied a +# transformation per samples instead of per features. +# + plt.show() From 79f6e97a5450680b97eb52cd5074a74c274d8ad9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Mar 2017 20:41:52 +0100 Subject: [PATCH 072/106] EXA improve the docstring --- examples/preprocessing/plot_all_scaling.py | 34 +++++-------------- ...plot_smoothing_noise_quantile_transform.py | 3 +- 2 files changed, 11 insertions(+), 26 deletions(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 37170714dffdb..f330b89f29238 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -1,7 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -"""============================================================= +""" +============================================================= Compare the effect of different scalers on data with outliers ============================================================= @@ -16,7 +17,13 @@ algorithms or algorithms using gradient-based optimization. This example uses different scalers, transformers and normalizers to -bring the data within a smaller range. +bring the data within a smaller range. Scalers are linear +transformations and differ from each other depending of the subset of +data which is considered to define the estimate using during +scaling. ``QuantileTransformer`` provides a non-linear transformation +in which distances between marginal outliers and inliers are +shrunk. Unlinke the previous transformation, normalization refers to a +per sample transformation instead of a per feature transformation. """ @@ -152,27 +159,4 @@ def plot_distribution(axes, X, y, hist_nbins=50, plot_title="", size=(15, 10), norm=norm, orientation='horizontal', label='Color mapping for values of y') -############################################################################### -# The different scalers applied a linear transformation to the -# data. The main between these scalers lie in the fact that they are -# using a subset of data to apply this linear -# scaling. 
``MinMaxScaler`` will take the full data range while -# ``RobustScaler`` will use a subset of data by discarding data -# outside of certain percentiles. The ``MaxAbsScaler`` will use the -# maximum absolute value while the ``StandardScaler`` will use the -# mean and stardard deviation to scale the data. -# -# The ``QuantileTransformer`` will shrink the distance between the -# outliers and inliers, since this transformation is -# non-linear. Consequently, comparison between features is made easier -# while the potential discriminative power of outliers is -# discarded. This is an important to consider if the aim of the -# application is to detect those outliers. Additionally, this -# transform can map the data to either a uniform or a normal -# distribution. -# -# Unlike the scalers and the transformers, the `Normalizer` applied a -# transformation per samples instead of per features. -# - plt.show() diff --git a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py index 9c5ff8e19bc35..918cd0fca2b15 100755 --- a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py +++ b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -"""======================================================== +""" +======================================================== Effect of smoothing noise when using QuantileTransformer ======================================================== From 12a3f470d6c00bc3aaaf0d7b29b6fb95da8939e5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Mar 2017 20:48:50 +0100 Subject: [PATCH 073/106] DOC add a bit more documentation --- doc/modules/preprocessing.rst | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index ea06c5d6b0b16..6229c04eaeac0 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -253,10 +253,13 @@ defined by :math:`phi` followed by removal of the mean in that space. Non-linear transformation ========================= -In the contrary of scaling, data can be non-linearly transformed to reduce the -influence of marginal outliers present in a dataset. Additionally, reducing the -influence of those outliers allows for a more direct comparison between -features, at the cost of distorting correlations between them. +In the contrary of scaling, data can be non-linearly transformed to +reduce the influence of marginal outliers present in a +dataset. Additionally, reducing the influence of those outliers allows +for a more direct comparison between features, at the cost of +distorting correlations between them. Be aware that if the final aim +is to predict such outliers, this transformation completely inhibit the +discriminative power of those samples. 
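How a rank-based transformation removes the discriminative power of marginal outliers can be seen with a short illustration (plain NumPy/SciPy, not the patch's implementation): once values are mapped to their empirical quantiles, the outlier ends up only one step away from the largest inlier::

    import numpy as np
    from scipy.stats import rankdata

    x = np.array([1., 2., 3., 4., 1000.])      # 1000. is a marginal outlier
    ranks = (rankdata(x) - 1) / (len(x) - 1)    # empirical quantiles in [0, 1]
    print(ranks)                                # [0.   0.25 0.5  0.75 1.  ]
    # after the transformation the outlier is indistinguishable from a value
    # that is merely slightly larger than the other inliers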
:class:`QuantileTransformer` and :func:`quantile_transform` provide a non-parametric transformation based the quantile function to map the data to a From 9226f739a4a492de9bfe7a2d4e583b0ede7cc396 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 6 Apr 2017 00:52:00 +0200 Subject: [PATCH 074/106] FIX advance review --- doc/modules/preprocessing.rst | 27 ++-- examples/preprocessing/plot_all_scaling.py | 2 +- sklearn/preprocessing/data.py | 158 ++++++++++++--------- 3 files changed, 102 insertions(+), 85 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 6229c04eaeac0..502b298141490 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -253,13 +253,11 @@ defined by :math:`phi` followed by removal of the mean in that space. Non-linear transformation ========================= -In the contrary of scaling, data can be non-linearly transformed to -reduce the influence of marginal outliers present in a -dataset. Additionally, reducing the influence of those outliers allows -for a more direct comparison between features, at the cost of -distorting correlations between them. Be aware that if the final aim -is to predict such outliers, this transformation completely inhibit the -discriminative power of those samples. +Like scalers, :class:`QuantileTransformer` puts each feature into the same +range or distribution. However, by performing a rank transformation, it smooths +out unusual distributions and is less influenced by outliers than scaling +methods. It does, however, distort correlations and distances within and across +features. :class:`QuantileTransformer` and :func:`quantile_transform` provide a non-parametric transformation based the quantile function to map the data to a @@ -272,18 +270,13 @@ uniform distribution:: >>> X_trans = quantile_transformer.fit_transform(iris.data) It is also possible to map the transformed data to a normal distribution by -setting ``output_distribution='norm'``:: +setting ``output_distribution='normal'``:: - >>> X_trans = preprocessing.quantile_transform(X, output_distribution='norm') + >>> X_trans = preprocessing.quantile_transform(X, output_distribution='normal') -.. topic:: Sparse input - - :class:`QuantileTransformer` and :func:`quantile_transform` accept **both - dense array-like and sparse matrices from scipy.sparse as input**. - - For sparse input the data is **converted to the Compressed Sparse Columns - representation** (see ``scipy.sparse.csc_matrix``). To avoid unnecessary - memory copies, it is recommended to choose the CSC representation upstream. +Thus the median of the input becomes the mean of the output, centered at 0. The +normal output is clipped so that the input's maximum and minimum do not become +infinite under the transformation. .. 
_preprocessing_normalization: diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index f330b89f29238..41d76447d0fa6 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -78,7 +78,7 @@ QuantileTransformer(output_distribution='uniform') .fit_transform(X)), ('Data after quantile transformation (gaussian pdf)', - QuantileTransformer(output_distribution='norm') + QuantileTransformer(output_distribution='normal') .fit_transform(X)))) y = minmax_scale(y_full) # To make colors corresponding to the target), diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index cc0a61f4d333e..5b07750b2f6bc 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1915,7 +1915,7 @@ def transform(self, X): class QuantileTransformer(BaseEstimator, TransformerMixin): """Transform features using quantiles information. - This method scales the features to follow a uniform or a normal + This method transforms the features to follow a uniform or a normal distribution. Therefore, for a given feature, this transformation tends to spread out the most frequent values. It also reduces the impact of (marginal) outliers: this is therefore a robust preprocessing scheme. @@ -1938,32 +1938,42 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): output_distribution : str, optional (default='uniform') Marginal distribution for the transformed data. The choices are - 'uniform' (default) or 'norm'. + 'uniform' (default) or 'normal'. ignore_implicit_zeros : bool, optional (default=False) - Apply only for sparse matrices. If True, the sparse entries of the + Only applies to sparse matrices. If True, the sparse entries of the matrix are discarded to compute the quantile statistics. If false, - these entries are accounting for zeros. + these entries are treated as zeros. subsample : int, optional (default=1e5) - Maximum number of samples used to estimate the quantiles. + Maximum number of samples used to estimate the quantiles for + computational efficiency. - smoothing_noise : None or float, optional (default=None) - Standard deviation of the added noise before computing the - quantiles. This parameter is useful if there is a predominant - feature value predominant. + smoothing_noise : float, optional + Perturbs features at training time before computing quantiles by adding + Gaussian noise with standard deviation ``smoothing_noise``. It eases + the interpratation of the computed ``quantiles_`` when a particular + feature value is predominant. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used - by np.random. + by np.random. Note that this is used by subsampling and smoothing + noise. + + copy : boolean, optional, (default=True) + Set to False to perform inplace scaling and avoid a copy (if the input + is already a numpy array). Attributes ---------- - quantiles_ : ndarray, shape (n_quantiles_, n_features) + quantiles_ : ndarray, shape (n_quantiles, n_features) The values corresponding the quantiles of reference. + references_ : ndarray, shape(n_quantiles, ) + Quantiles of references. 
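Conceptually, ``fit`` stores a grid of reference probabilities (``references_``) together with the matching empirical quantiles of each feature (``quantiles_``), and ``transform`` interpolates between the two grids, mirroring the ``np.interp`` calls in ``_transform_col``. A simplified single-feature sketch of that idea (plain NumPy, ignoring clipping, sparse input and the output distribution)::

    import numpy as np

    rng = np.random.RandomState(0)
    x = rng.lognormal(size=1000)                 # one skewed feature

    n_quantiles = 11
    references = np.linspace(0, 1, n_quantiles)      # plays the role of references_
    quantiles = np.percentile(x, references * 100)   # plays the role of quantiles_[:, 0]

    # forward transform: map each value onto the reference grid
    x_uniform = np.interp(x, quantiles, references)
    # inverse transform: swap the roles of the two grids
    x_back = np.interp(x_uniform, references, quantiles)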
+ Examples -------- >>> from sklearn.datasets import load_iris @@ -1980,27 +1990,27 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): RobustScaler : perform robust standardization that removes the influence of outliers but does not put outliers and inliers on the same scale. + """ def __init__(self, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), - smoothing_noise=None, random_state=None): + smoothing_noise=None, random_state=None, copy=True): self.n_quantiles = n_quantiles self.output_distribution = output_distribution self.ignore_implicit_zeros = ignore_implicit_zeros self.subsample = subsample self.smoothing_noise = smoothing_noise self.random_state = random_state + self.copy = copy def _compute_quantile_one_column(self, X_col, references, random_state): """Private function to compute the quantiles for one features.""" - if self.smoothing_noise is None: - noise = np.zeros(X_col.shape) - else: - noise = random_state.normal(0, self.smoothing_noise, - size=X_col.shape) + if self.smoothing_noise is not None: + X_col += random_state.normal(0, self.smoothing_noise, + size=X_col.shape) - return np.percentile(X_col + noise, references) + return np.percentile(X_col, references) def _dense_fit(self, X, random_state): """Compute percentiles for dense matrices. @@ -2014,22 +2024,24 @@ def _dense_fit(self, X, random_state): warnings.warn("'ignore_implicit_zeros' takes effect only with" " sparse matrix. This parameter has no effect.") - # subsample the matrix X if necessary n_samples, n_features = X.shape - if self.subsample < n_samples: - subsample_idx = choice(range(n_samples), size=self.subsample, - replace=False, random_state=random_state) - else: - subsample_idx = slice(None) - # for compatibility issue with numpy<=1.8.X, references # need to be a list scaled between 0 and 100 - references = np.linspace(0, 100, self.n_quantiles, - endpoint=True).tolist() - self.quantiles_ = np.transpose( - [self._compute_quantile_one_column(X[subsample_idx, feature_idx], - references, random_state) - for feature_idx in range(n_features)]) + references = list(map(lambda x: x * 100, self.references_)) + self.quantiles_ = [] + for col in X.T: + if self.subsample < n_samples: + subsample_idx = choice(n_samples, size=self.subsample, + replace=False, + random_state=random_state) + else: + subsample_idx = range(n_samples) + + self.quantiles_.append( + self._compute_quantile_one_column(col.take(subsample_idx, + mode='clip'), + references, random_state)) + self.quantiles_ = np.transpose(self.quantiles_) def _sparse_fit(self, X, random_state): """Compute percentiles for sparse matrices. @@ -2038,14 +2050,13 @@ def _sparse_fit(self, X, random_state): ---------- X : sparse matrix CSC, shape (n_samples, n_features) The data used to scale along the features axis. The sparse matrix - needs to be semi-positive. + needs to be nonnegative. """ n_samples, n_features = X.shape # for compatibility issue with numpy<=1.8.X, references - # need to be a list - references = np.linspace(0, 100, self.n_quantiles, - endpoint=True).tolist() + # need to be a list scaled between 0 and 100 + references = list(map(lambda x: x * 100, self.references_)) self.quantiles_ = [] for feature_idx in range(n_features): column_nnz_data = X.data[X.indptr[feature_idx]: @@ -2088,7 +2099,7 @@ def fit(self, X, y=None): The data used to scale along the features axis. If a sparse matrix is provided, it will be converted into a sparse ``csc_matrix``. 
Additionally, the sparse matrix needs to be - semi-positive if `ignore_implicit_zeros` is False. + nonnegative if `ignore_implicit_zeros` is False. Returns ------- @@ -2121,6 +2132,9 @@ def fit(self, X, y=None): raise ValueError('QuantileTransformer only accepts non-negative' ' sparse matrices') + # Create the quantiles of reference + self.references_ = np.linspace(0, 1, self.n_quantiles, + endpoint=True) if sparse.issparse(X): self._sparse_fit(X, rng) else: @@ -2128,22 +2142,27 @@ def fit(self, X, y=None): return self - def _transform_col(self, X_col, feature_idx, inverse): + def _transform_col(self, X_col, quantiles, inverse): """Private function to transform a single feature""" - output_distribution = getattr(stats, self.output_distribution) + if self.output_distribution == 'normal': + output_distribution = 'norm' + else: + output_distribution = self.output_distribution + output_distribution = getattr(stats, output_distribution) + # older version of scipy do not handle tuple as fill_value # clipping the value before transform solve the issue if not inverse: - lower_bound_x = self.quantiles_[0, feature_idx] - upper_bound_x = self.quantiles_[-1, feature_idx] + lower_bound_x = quantiles[0] + upper_bound_x = quantiles[-1] lower_bound_y = 0 upper_bound_y = 1 else: lower_bound_x = 0 upper_bound_x = 1 - lower_bound_y = self.quantiles_[0, feature_idx] - upper_bound_y = self.quantiles_[-1, feature_idx] + lower_bound_y = quantiles[0] + upper_bound_y = quantiles[-1] # for inverse transform, match a uniform PDF X_col = output_distribution.cdf(X_col) # find index for lower and higher bounds @@ -2152,13 +2171,10 @@ def _transform_col(self, X_col, feature_idx, inverse): upper_bounds_idx = (X_col + BOUNDS_THRESHOLD > upper_bound_x) - references = np.linspace(0, 1, self.n_quantiles, endpoint=True) if not inverse: - X_col = np.interp(X_col, self.quantiles_[:, feature_idx], - references) + X_col = np.interp(X_col, quantiles, self.references_) else: - X_col = np.interp(X_col, references, - self.quantiles_[:, feature_idx]) + X_col = np.interp(X_col, self.references_, quantiles) X_col[upper_bounds_idx] = upper_bound_y X_col[lower_bounds_idx] = lower_bound_y @@ -2190,12 +2206,12 @@ def _dense_transform(self, X, inverse=False): Returns ------- - X : ndarray, shape (n_samples, n_features) - Projected data """ - + X : ndarray, shape (n_samples, n_features) + Projected data + """ for feature_idx in range(X.shape[1]): - X[:, feature_idx] = self._transform_col(X[:, feature_idx], - feature_idx, inverse) + X[:, feature_idx] = self._transform_col( + X[:, feature_idx], self.quantiles_[:, feature_idx], inverse) return X @@ -2206,7 +2222,7 @@ def _sparse_transform(self, X, inverse=False): ---------- X : sparse matrix CSC, shape (n_samples, n_features) The data used to scale along the features axis. The sparse matrix - needs to be semi-positive. + needs to be nonnegative. inverse : bool, optional (default=False) If False, apply forward transform. 
If True, apply @@ -2221,14 +2237,14 @@ def _sparse_transform(self, X, inverse=False): for feature_idx in range(X.shape[1]): column_slice = slice(X.indptr[feature_idx], X.indptr[feature_idx + 1]) - X.data[column_slice] = self._transform_col(X.data[column_slice], - feature_idx, inverse) + X.data[column_slice] = self._transform_col( + X.data[column_slice], self.quantiles_[:, feature_idx], inverse) return X def _check_inputs_transform(self, X): """Private function to check the inputs before transforming""" - X = check_array(X, accept_sparse='csc', copy=True, + X = check_array(X, accept_sparse='csc', copy=self.copy, dtype=[np.float64, np.float32]) # we only accept positive sparse matrix when ignore_implicit_zeros is # false @@ -2244,7 +2260,7 @@ def _check_inputs_transform(self, X): ' {}'.format(X.shape[1], self.quantiles_.shape[1])) # check the output PDF - if self.output_distribution not in ('norm', 'uniform'): + if self.output_distribution not in ('normal', 'uniform'): raise ValueError("'output_distribution' has to be either 'norm' or" " 'uniform'. Got {} instead.".format( self.output_distribution)) @@ -2260,7 +2276,7 @@ def transform(self, X): The data used to scale along the features axis. If a sparse matrix is provided, it will be converted into a sparse ``csc_matrix``. Additionally, the sparse matrix needs to be - semi-positive if `ignore_implicit_zeros` is False. + nonnegative if `ignore_implicit_zeros` is False. Returns ------- @@ -2281,7 +2297,7 @@ def inverse_transform(self, X): The data used to scale along the features axis. If a sparse matrix is provided, it will be converted into a sparse ``csc_matrix``. Additionally, the sparse matrix needs to be - semi-positive if `ignore_implicit_zeros` is False. + nonnegative if `ignore_implicit_zeros` is False. Returns ------- @@ -2301,10 +2317,11 @@ def quantile_transform(X, axis=0, n_quantiles=1000, ignore_implicit_zeros=False, subsample=int(1e5), smoothing_noise=None, - random_state=None): + random_state=None, + copy=False): """Transform features using quantiles information. - This method scales the features to follow a uniform or a normal + This method transforms the features to follow a uniform or a normal distribution. Therefore, for a given feature, this transformation tends to spread out the most frequent values. It also reduces the impact of (marginal) outliers: this is therefore a robust preprocessing scheme. @@ -2325,7 +2342,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, The data used to scale along the features axis. If a sparse matrix is provided, it will be converted into a sparse ``csc_matrix``. Additionally, the sparse matrix needs to be - semi-positive if `ignore_implicit_zeros` is False. + nonnegative if `ignore_implicit_zeros` is False. axis : 0 or 1, optional (0 by default) axis used to normalize the data along. If 1, independently normalize @@ -2337,7 +2354,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, output_distribution : str, optional (default='uniform') Marginal distribution for the transformed data. The choices are - 'uniform' (default) or 'norm'. + 'uniform' (default) or 'normal'. ignore_implicit_zeros : bool, optional (default=False) Apply only for sparse matrices. If True, the sparse entries of the @@ -2347,7 +2364,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, subsample : int, optional (default=1e5) Maximum number of samples used to estimate the quantiles. 
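Since the interpolation grids only cover the training range, values seen at transform time outside that range saturate at the output bounds. A NumPy-only sketch of this behaviour (illustrative, not the estimator's code)::

    import numpy as np

    x_train = np.array([1., 2., 3., 4., 5.])
    refs = np.linspace(0, 1, 5)
    quantiles = np.percentile(x_train, refs * 100)

    # np.interp clamps queries outside the range of ``quantiles``, so values
    # below the training minimum map to 0 and values above the maximum map to 1
    print(np.interp([-10., 2.5, 100.], quantiles, refs))   # [0.    0.375 1.   ]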
- smoothing_noise : None or float, optional (default=None) + smoothing_noise : float, optional Standard deviation of the added noise before computing the quantiles. This parameter is useful if there is a predominant feature value predominant. @@ -2358,9 +2375,13 @@ def quantile_transform(X, axis=0, n_quantiles=1000, If None, the random number generator is the RandomState instance used by np.random. + copy : boolean, optional, (default=True) + Set to False to perform inplace scaling and avoid a copy (if the input + is already a numpy array). + Attributes ---------- - quantiles_ : ndarray, shape (n_quantiles_, n_features) + quantiles_ : ndarray, shape (n_quantiles, n_features) The values corresponding the quantiles of reference. Examples @@ -2382,10 +2403,13 @@ def quantile_transform(X, axis=0, n_quantiles=1000, robust_scale : perform robust standardization that removes the influence of outliers but does not put outliers and inliers on the same scale. """ - n = QuantileTransformer(n_quantiles=n_quantiles, subsample=subsample, + n = QuantileTransformer(n_quantiles=n_quantiles, + output_distribution=output_distribution, + subsample=subsample, ignore_implicit_zeros=ignore_implicit_zeros, smoothing_noise=smoothing_noise, - random_state=random_state) + random_state=random_state, + copy=copy) if axis == 0: return n.fit_transform(X) elif axis == 1: From b47158f194ffac62c1a5d5339d2ce3ba624c0f4e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 6 Apr 2017 20:00:15 +0200 Subject: [PATCH 075/106] TST add subsampling test --- sklearn/preprocessing/data.py | 4 +- sklearn/preprocessing/tests/test_data.py | 73 ++++++++++++------------ 2 files changed, 40 insertions(+), 37 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 5b07750b2f6bc..e533ea451e579 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2007,8 +2007,8 @@ def __init__(self, n_quantiles=1000, output_distribution='uniform', def _compute_quantile_one_column(self, X_col, references, random_state): """Private function to compute the quantiles for one features.""" if self.smoothing_noise is not None: - X_col += random_state.normal(0, self.smoothing_noise, - size=X_col.shape) + X_col = X_col + random_state.normal(0, self.smoothing_noise, + size=X_col.shape) return np.percentile(X_col, references) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index cbd419b77679f..f17154b59407c 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -3,12 +3,12 @@ # Giorgio Patrini # # License: BSD 3 clause +from __future__ import division import warnings -import pickle import numpy as np import numpy.linalg as la -from scipy import sparse +from scipy import sparse, stats from distutils.version import LooseVersion from sklearn.utils import gen_batches @@ -974,8 +974,8 @@ def test_quantile_transform_dense_toy(): transformer.fit(X) X_trans = transformer.fit_transform(X) - assert_almost_equal(np.min(X_trans, axis=0), 0.) - assert_almost_equal(np.max(X_trans, axis=0), 1.) 
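The motivation for ``smoothing_noise`` can be reproduced with a short NumPy sketch (an illustration of the idea, not the estimator's code): without noise, a heavily repeated value is mapped by ``np.interp`` to the upper end of its tie block, whereas a tiny perturbation applied before computing the quantiles moves it inside the block::

    import numpy as np

    rng = np.random.RandomState(0)
    # the value 3.0 is heavily predominant in this feature
    x = np.array([1.] * 2 + [3.] * 6 + [5.] * 2)

    refs = np.linspace(0, 1, 101)
    quantiles = np.percentile(x, refs * 100)
    print(np.interp(3.0, quantiles, refs))        # upper end of the tie block

    x_noisy = x + rng.normal(0, 1e-7, size=x.shape)
    quantiles_noisy = np.percentile(x_noisy, refs * 100)
    print(np.interp(3.0, quantiles_noisy, refs))  # strictly inside the block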
+ X_gt = np.tile(np.linspace(0, 1, num=5), (3, 1)).T + assert_almost_equal(np.sort(X_trans, axis=0), X_gt) X_test = np.array([ [-1, 1, 0], @@ -990,13 +990,39 @@ def test_quantile_transform_dense_toy(): X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) - # test subsampling - # FIXME: there is not comparison for the moment - random_state = 42 - transformer.set_params(**{'subsample': 3, - 'n_quantiles': 2, - 'random_state': random_state}) - X_trans = transformer.fit_transform(X) + +def test_quantile_transform_subsampling(): + # dense support + N = 1000000 + X = np.sort(np.random.sample((N, 1)), axis=0) + # transform by subsampling several time with different random state + ROUND = 5 + inf_norm_arr = [] + for random_state in range(ROUND): + transformer_subsample = QuantileTransformer(random_state=random_state, + n_quantiles=N, + subsample=N // 10) + transformer_subsample.fit(X) + inf_norm = np.max(np.abs(np.linspace(0, 1, N) - + np.ravel(transformer_subsample.quantiles_))) + assert_true(inf_norm < 1e-2) + inf_norm_arr.append(inf_norm) + assert_equal(len(np.unique(inf_norm_arr)), len(inf_norm_arr)) + + # sparse support + X = sparse.random(N, 1, density=.9, format='csc', + random_state=0, data_rvs=stats.uniform().rvs) + inf_norm_arr = [] + for random_state in range(ROUND): + transformer_subsample = QuantileTransformer(random_state=random_state, + n_quantiles=N, + subsample=N // 10) + transformer_subsample.fit(X) + inf_norm = np.max(np.abs(np.linspace(0, 1, N) - + np.ravel(transformer_subsample.quantiles_))) + assert_true(inf_norm < 1e-1) + inf_norm_arr.append(inf_norm) + assert_equal(len(np.unique(inf_norm_arr)), len(inf_norm_arr)) def test_quantile_transform_sparse_toy(): @@ -1025,14 +1051,6 @@ def test_quantile_transform_sparse_toy(): X_trans_inv = transformer_dense.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) - # test subsampling - # FIXME: there is not comparison for the moment - random_state = 42 - transformer.set_params(**{'subsample': 3, - 'n_quantiles': 2, - 'random_state': random_state}) - X_trans = transformer.fit_transform(X) - def test_quantile_transform_axis1(): X = np.array([[0, 25, 50, 75, 100], @@ -1044,7 +1062,7 @@ def test_quantile_transform_axis1(): assert_array_almost_equal(X_trans_a0, X_trans_a1.T) -def test_qunatile_transform_bounds(): +def test_quantile_transform_bounds(): X_dense = np.array([[0, 0], [0, 0], [1, 0]]) @@ -1068,21 +1086,6 @@ def test_qunatile_transform_bounds(): assert_array_almost_equal(X_trans, X1) -def test_quantile_transform_pickling(): - transformer = QuantileTransformer(n_quantiles=100) - - transformer_ser = pickle.dumps(transformer, pickle.HIGHEST_PROTOCOL) - transformer2 = pickle.loads(transformer_ser) - assert_false(hasattr(transformer2, 'f_transform_')) - assert_false(hasattr(transformer2, 'f_inverse_transform_')) - - transformer.fit(iris.data) - transformer_ser = pickle.dumps(transformer, pickle.HIGHEST_PROTOCOL) - transformer2 = pickle.loads(transformer_ser) - assert_array_almost_equal(transformer.transform(iris.data), - transformer2.transform(iris.data)) - - def test_quantile_transform_add_noise_subsamples(): # toy examples unique_feature = [0, 0.5, 1] From bd928edb73ad874bd0c57b5cc60c9b4ab37fbe99 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Apr 2017 14:16:03 +0200 Subject: [PATCH 076/106] DOC/TST better example for the docstring --- sklearn/preprocessing/data.py | 86 ++++++++++++++++-------- sklearn/preprocessing/tests/test_data.py | 4 +- 
2 files changed, 59 insertions(+), 31 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index e533ea451e579..7e68448c890ad 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1976,10 +1976,28 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): Examples -------- - >>> from sklearn.datasets import load_iris + >>> import numpy as np >>> from sklearn.preprocessing import QuantileTransformer - >>> iris = load_iris() - >>> X_trans = QuantileTransformer(n_quantiles=20).fit_transform(iris.data) + >>> RNG = np.random.RandomState(0) + >>> X = np.sort(RNG.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) + >>> # these samples follow a Gaussian distribution + >>> print(np.ravel(X)) + [-0.13824745 0.25568053 0.28647607 0.31445874 0.44871043 0.4621607 + 0.47419529 0.53041875 0.53601089 0.57826693 0.58341858 0.6000393 + 0.60264963 0.61096581 0.66340465 0.69025943 0.71610905 0.7375221 + 0.7446845 0.86356838 0.87351977 0.94101309 0.9668895 1.0602233 + 1.06743866] + >>> qt = QuantileTransformer(n_quantiles=10, random_state=0) + >>> Xt = qt.fit_transform(X) + >>> # these samples have been mapped to a uniform distribution + >>> print(np.ravel(Xt)) + [ 9.99999998e-08 9.87187297e-02 1.06436120e-01 1.17546710e-01 + 2.10174366e-01 2.19454446e-01 2.34986656e-01 3.24436417e-01 + 3.33333333e-01 4.13607939e-01 4.23394641e-01 4.62578413e-01 + 4.71122361e-01 4.98342373e-01 5.99865364e-01 6.33903024e-01 + 6.66666667e-01 6.88731010e-01 6.96111249e-01 8.12806989e-01 + 8.21603540e-01 8.81264388e-01 9.05160277e-01 9.93194350e-01 + 9.99999900e-01] See also -------- @@ -2034,13 +2052,10 @@ def _dense_fit(self, X, random_state): subsample_idx = choice(n_samples, size=self.subsample, replace=False, random_state=random_state) - else: - subsample_idx = range(n_samples) - + col = col.take(subsample_idx, mode='clip') self.quantiles_.append( - self._compute_quantile_one_column(col.take(subsample_idx, - mode='clip'), - references, random_state)) + self._compute_quantile_one_column(col, references, + random_state)) self.quantiles_ = np.transpose(self.quantiles_) def _sparse_fit(self, X, random_state): @@ -2338,16 +2353,6 @@ def quantile_transform(X, axis=0, n_quantiles=1000, Parameters ---------- - X : ndarray or sparse matrix, shape (n_samples, n_features) - The data used to scale along the features axis. If a sparse - matrix is provided, it will be converted into a sparse - ``csc_matrix``. Additionally, the sparse matrix needs to be - nonnegative if `ignore_implicit_zeros` is False. - - axis : 0 or 1, optional (0 by default) - axis used to normalize the data along. If 1, independently normalize - each sample, otherwise (if 0) normalize each feature. - n_quantiles : int, optional (default=1000) Number of quantiles to be computed. It corresponds to the number of landmarks used to discretize the cumulative density function. @@ -2357,23 +2362,26 @@ def quantile_transform(X, axis=0, n_quantiles=1000, 'uniform' (default) or 'normal'. ignore_implicit_zeros : bool, optional (default=False) - Apply only for sparse matrices. If True, the sparse entries of the + Only applies to sparse matrices. If True, the sparse entries of the matrix are discarded to compute the quantile statistics. If false, - these entries are accounting for zeros. + these entries are treated as zeros. subsample : int, optional (default=1e5) - Maximum number of samples used to estimate the quantiles. + Maximum number of samples used to estimate the quantiles for + computational efficiency. 
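The role of ``subsample`` can be checked directly with NumPy (a sketch of the idea behind the subsampling test added above, not the patch's code): quantiles estimated from a random subset of a column stay close to the quantiles of the full column while being cheaper to compute::

    import numpy as np

    rng = np.random.RandomState(0)
    x = rng.normal(size=100000)

    refs = np.linspace(0, 1, 100)
    full = np.percentile(x, refs * 100)

    # quantiles estimated on a 10% random subsample remain close to the
    # quantiles estimated on the full column
    idx = rng.choice(len(x), size=10000, replace=False)
    sub = np.percentile(x[idx], refs * 100)
    print(np.max(np.abs(full - sub)))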
smoothing_noise : float, optional - Standard deviation of the added noise before computing the - quantiles. This parameter is useful if there is a predominant - feature value predominant. + Perturbs features at training time before computing quantiles by adding + Gaussian noise with standard deviation ``smoothing_noise``. It eases + the interpratation of the computed ``quantiles_`` when a particular + feature value is predominant. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used - by np.random. + by np.random. Note that this is used by subsampling and smoothing + noise. copy : boolean, optional, (default=True) Set to False to perform inplace scaling and avoid a copy (if the input @@ -2384,12 +2392,32 @@ def quantile_transform(X, axis=0, n_quantiles=1000, quantiles_ : ndarray, shape (n_quantiles, n_features) The values corresponding the quantiles of reference. + references_ : ndarray, shape(n_quantiles, ) + Quantiles of references. + Examples -------- - >>> from sklearn.datasets import load_iris + >>> import numpy as np >>> from sklearn.preprocessing import quantile_transform - >>> iris = load_iris() - >>> X_trans = quantile_transform(iris.data, n_quantiles=20) + >>> RNG = np.random.RandomState(0) + >>> X = np.sort(RNG.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) + >>> # these samples follow a Gaussian distribution + >>> print(np.ravel(X)) + [-0.13824745 0.25568053 0.28647607 0.31445874 0.44871043 0.4621607 + 0.47419529 0.53041875 0.53601089 0.57826693 0.58341858 0.6000393 + 0.60264963 0.61096581 0.66340465 0.69025943 0.71610905 0.7375221 + 0.7446845 0.86356838 0.87351977 0.94101309 0.9668895 1.0602233 + 1.06743866] + >>> Xt = quantile_transform(X, n_quantiles=10, random_state=0) + >>> # these samples have been mapped to a uniform distribution + >>> print(np.ravel(Xt)) + [ 9.99999998e-08 9.87187297e-02 1.06436120e-01 1.17546710e-01 + 2.10174366e-01 2.19454446e-01 2.34986656e-01 3.24436417e-01 + 3.33333333e-01 4.13607939e-01 4.23394641e-01 4.62578413e-01 + 4.71122361e-01 4.98342373e-01 5.99865364e-01 6.33903024e-01 + 6.66666667e-01 6.88731010e-01 6.96111249e-01 8.12806989e-01 + 8.21603540e-01 8.81264388e-01 9.05160277e-01 9.93194350e-01 + 9.99999900e-01] See also -------- diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index f17154b59407c..51a9e202cb04f 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -995,7 +995,6 @@ def test_quantile_transform_subsampling(): # dense support N = 1000000 X = np.sort(np.random.sample((N, 1)), axis=0) - # transform by subsampling several time with different random state ROUND = 5 inf_norm_arr = [] for random_state in range(ROUND): @@ -1010,7 +1009,7 @@ def test_quantile_transform_subsampling(): assert_equal(len(np.unique(inf_norm_arr)), len(inf_norm_arr)) # sparse support - X = sparse.random(N, 1, density=.9, format='csc', + X = sparse.random(N, 1, density=.99, format='csc', random_state=0, data_rvs=stats.uniform().rvs) inf_norm_arr = [] for random_state in range(ROUND): @@ -1020,6 +1019,7 @@ def test_quantile_transform_subsampling(): transformer_subsample.fit(X) inf_norm = np.max(np.abs(np.linspace(0, 1, N) - np.ravel(transformer_subsample.quantiles_))) + assert_true(inf_norm < 1e-1) inf_norm_arr.append(inf_norm) 
assert_equal(len(np.unique(inf_norm_arr)), len(inf_norm_arr)) From c70aba0345c0827e00f84533dcdb4979b6c02d01 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Apr 2017 15:50:47 +0200 Subject: [PATCH 077/106] DOC add ellipsis to docstring --- sklearn/preprocessing/data.py | 39 +++++------------------------------ 1 file changed, 5 insertions(+), 34 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 7e68448c890ad..2d7ca011736ab 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1980,24 +1980,9 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing import QuantileTransformer >>> RNG = np.random.RandomState(0) >>> X = np.sort(RNG.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) - >>> # these samples follow a Gaussian distribution - >>> print(np.ravel(X)) - [-0.13824745 0.25568053 0.28647607 0.31445874 0.44871043 0.4621607 - 0.47419529 0.53041875 0.53601089 0.57826693 0.58341858 0.6000393 - 0.60264963 0.61096581 0.66340465 0.69025943 0.71610905 0.7375221 - 0.7446845 0.86356838 0.87351977 0.94101309 0.9668895 1.0602233 - 1.06743866] >>> qt = QuantileTransformer(n_quantiles=10, random_state=0) - >>> Xt = qt.fit_transform(X) - >>> # these samples have been mapped to a uniform distribution - >>> print(np.ravel(Xt)) - [ 9.99999998e-08 9.87187297e-02 1.06436120e-01 1.17546710e-01 - 2.10174366e-01 2.19454446e-01 2.34986656e-01 3.24436417e-01 - 3.33333333e-01 4.13607939e-01 4.23394641e-01 4.62578413e-01 - 4.71122361e-01 4.98342373e-01 5.99865364e-01 6.33903024e-01 - 6.66666667e-01 6.88731010e-01 6.96111249e-01 8.12806989e-01 - 8.21603540e-01 8.81264388e-01 9.05160277e-01 9.93194350e-01 - 9.99999900e-01] + >>> qt.fit_transform(X) # doctest: +ELLIPSIS + array([...]) See also -------- @@ -2401,23 +2386,9 @@ def quantile_transform(X, axis=0, n_quantiles=1000, >>> from sklearn.preprocessing import quantile_transform >>> RNG = np.random.RandomState(0) >>> X = np.sort(RNG.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) - >>> # these samples follow a Gaussian distribution - >>> print(np.ravel(X)) - [-0.13824745 0.25568053 0.28647607 0.31445874 0.44871043 0.4621607 - 0.47419529 0.53041875 0.53601089 0.57826693 0.58341858 0.6000393 - 0.60264963 0.61096581 0.66340465 0.69025943 0.71610905 0.7375221 - 0.7446845 0.86356838 0.87351977 0.94101309 0.9668895 1.0602233 - 1.06743866] - >>> Xt = quantile_transform(X, n_quantiles=10, random_state=0) - >>> # these samples have been mapped to a uniform distribution - >>> print(np.ravel(Xt)) - [ 9.99999998e-08 9.87187297e-02 1.06436120e-01 1.17546710e-01 - 2.10174366e-01 2.19454446e-01 2.34986656e-01 3.24436417e-01 - 3.33333333e-01 4.13607939e-01 4.23394641e-01 4.62578413e-01 - 4.71122361e-01 4.98342373e-01 5.99865364e-01 6.33903024e-01 - 6.66666667e-01 6.88731010e-01 6.96111249e-01 8.12806989e-01 - 8.21603540e-01 8.81264388e-01 9.05160277e-01 9.93194350e-01 - 9.99999900e-01] + >>> quantile_transform(X, n_quantiles=10, random_state=0) + ... 
# doctest: +ELLIPSIS + array([...]) See also -------- From 9a9556cc66a5a71acbb777767969c9e875137aae Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 8 Apr 2017 11:28:11 +0200 Subject: [PATCH 078/106] FIX address olivier comments --- doc/modules/preprocessing.rst | 7 ++- ...plot_smoothing_noise_quantile_transform.py | 23 +++++--- sklearn/preprocessing/tests/test_data.py | 57 ++++++++++--------- 3 files changed, 49 insertions(+), 38 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 502b298141490..7e2b3a0862960 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -261,7 +261,7 @@ features. :class:`QuantileTransformer` and :func:`quantile_transform` provide a non-parametric transformation based the quantile function to map the data to a -uniform distribution:: +uniform distribution with value between 0 and 1:: >>> from sklearn.datasets import load_iris >>> iris = load_iris() @@ -275,8 +275,9 @@ setting ``output_distribution='normal'``:: >>> X_trans = preprocessing.quantile_transform(X, output_distribution='normal') Thus the median of the input becomes the mean of the output, centered at 0. The -normal output is clipped so that the input's maximum and minimum do not become -infinite under the transformation. +normal output is clipped so that the input's maximum and minimum --- +corresponding to the 1 - 1e-7 and 1e-7 quantiles, respectively --- do not +become infinite under the transformation. .. _preprocessing_normalization: diff --git a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py index 918cd0fca2b15..ca18e4a0c62c3 100755 --- a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py +++ b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py @@ -5,11 +5,20 @@ Effect of smoothing noise when using QuantileTransformer ======================================================== -This example shows the effect of applying a small noise before -computing the quantiles in the QuantileTransformer. The parameter -``smoothing_noise`` can be used if a constant feature value is -predominant in the dataset. It will lead to a difficult interpretation -when the quantiles computed are manually checked. +The parameter ``smoothing_noise`` can be used if some specific feature values +are repeated exactly many times to the point of being predominant in the +dataset. + +Without smoothing noise, the ``QuantileTransformer`` will map those values to +some arbitrary value: the highest quantile value for all the inputs with the +same value. While this is usually not an issue when ``QuantileTransformer`` is +used as a preprocessing transformer for a subsequent subsequent supervised +estimator, it can lead to surprising results when manually inspecting the +transformed values (e.g. for visualization or reporting). + +The goal of the smoothing noise is to make it possible to map those repeated +values to some middle quantile value to make interpretation more intuitive as +demonstrated in the following. """ @@ -58,8 +67,8 @@ def plot_transform_feat_val(ax, transformer, title): # a large number of customers attributed a grade of 3 to the current # restaurant. 
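The clipping mentioned for ``output_distribution='normal'`` is easy to see with SciPy (an illustration only): the Gaussian quantile function sends 0 and 1 to infinities, so the uniform output has to be kept strictly inside (0, 1), here with the same 1e-7 margin as quoted above::

    import numpy as np
    from scipy import stats

    u = np.array([0.0, 0.25, 0.5, 0.75, 1.0])
    print(stats.norm.ppf(u))       # the endpoints map to -inf and +inf

    # clipping the uniform values away from 0 and 1 keeps the output finite
    eps = 1e-7
    print(stats.norm.ppf(np.clip(u, eps, 1 - eps)))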
-X = np.array([1] * 1000 + - [2] * 2000 + +X = np.array([1] * 2000 + + [2] * 1000 + [3] * 7000 + [4] * 2000 + [5] * 1000).reshape(-1, 1) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 51a9e202cb04f..3d274c39c32e2 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -993,35 +993,38 @@ def test_quantile_transform_dense_toy(): def test_quantile_transform_subsampling(): # dense support - N = 1000000 - X = np.sort(np.random.sample((N, 1)), axis=0) + n_samples = 1000000 + X = np.sort(np.random.sample((n_samples, 1)), axis=0) ROUND = 5 inf_norm_arr = [] for random_state in range(ROUND): - transformer_subsample = QuantileTransformer(random_state=random_state, - n_quantiles=N, - subsample=N // 10) - transformer_subsample.fit(X) - inf_norm = np.max(np.abs(np.linspace(0, 1, N) - - np.ravel(transformer_subsample.quantiles_))) + transformer = QuantileTransformer(random_state=random_state, + n_quantiles=n_samples, + subsample=n_samples//10) + transformer.fit(X) + diff = np.linspace(0, 1, n_samples) - np.ravel(transformer.quantiles_) + inf_norm = np.max(np.abs(diff)) assert_true(inf_norm < 1e-2) inf_norm_arr.append(inf_norm) + # each random subsampling yield a unique approximation to the expected + # linspace CDF assert_equal(len(np.unique(inf_norm_arr)), len(inf_norm_arr)) # sparse support - X = sparse.random(N, 1, density=.99, format='csc', - random_state=0, data_rvs=stats.uniform().rvs) + X = sparse.rand(n_samples, 1, density=.99, format='csc', random_state=0) inf_norm_arr = [] for random_state in range(ROUND): - transformer_subsample = QuantileTransformer(random_state=random_state, - n_quantiles=N, - subsample=N // 10) - transformer_subsample.fit(X) - inf_norm = np.max(np.abs(np.linspace(0, 1, N) - - np.ravel(transformer_subsample.quantiles_))) + transformer = QuantileTransformer(random_state=random_state, + n_quantiles=n_samples, + subsample=n_samples//10) + transformer.fit(X) + diff = np.linspace(0, 1, n_samples) - np.ravel(transformer.quantiles_) + inf_norm = np.max(np.abs(diff)) assert_true(inf_norm < 1e-1) inf_norm_arr.append(inf_norm) + # each random subsampling yield a unique approximation to the expected + # linspace CDF assert_equal(len(np.unique(inf_norm_arr)), len(inf_norm_arr)) @@ -1095,9 +1098,8 @@ def test_quantile_transform_add_noise_subsamples(): transformer = QuantileTransformer(n_quantiles=100, smoothing_noise=1e-7, random_state=0) transformer.fit(X) - # check that the feature values associated to quantiles are - # strictly monitically increasing as suggested by the 'interp' - # function from numpy + # check that the feature values associated to quantiles are strictly + # monitically increasing as suggested by the 'interp' function from numpy assert_true(np.all(np.diff(transformer.quantiles_) > 0)) # iris dataset X = iris.data @@ -1113,15 +1115,14 @@ def test_quantile_transform_add_noise_subsamples(): def test_quantile_transform_numpy_interp_behaviour(): - # The quantile transformer relies on the numpy implementation of - # 'interp' function. In the presence of a predominant constant - # feature values or a large number of quantiles, a single feature - # value is mapped to different quantiles. The default behaviour of - # 'interp' will be returning the larger quantile associated to the - # feature value. This test attends to check if there is any - # changes in the 'interp' function and to act accordingly. 
This - # implementation subtilities is mention in the docstring of the - # 'interp' function. + # The quantile transformer relies on the numpy implementation of 'interp' + # function. In the presence of a predominant constant feature values or a + # large number of quantiles, a single feature value is mapped to different + # quantiles. The default behaviour of 'interp' will be returning the + # largest quantile associated to the feature value. This test attends to + # check if there is any behavorial changes in the 'interp' function and to + # act accordingly. This implementation subtilities is mention in the + # docstring of the 'interp' function. unique_feature = [0, 0.5, 1] X = np.transpose([[unique_feature[0]] * 1 + From 6b105a916c02f160738c097d209231cf7f65fb2b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 8 Apr 2017 11:46:13 +0200 Subject: [PATCH 079/106] FIX remove random_state in sparse.rand --- sklearn/preprocessing/tests/test_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 3d274c39c32e2..7da1159ae4d5c 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -8,7 +8,7 @@ import warnings import numpy as np import numpy.linalg as la -from scipy import sparse, stats +from scipy import sparse from distutils.version import LooseVersion from sklearn.utils import gen_batches @@ -1011,7 +1011,7 @@ def test_quantile_transform_subsampling(): assert_equal(len(np.unique(inf_norm_arr)), len(inf_norm_arr)) # sparse support - X = sparse.rand(n_samples, 1, density=.99, format='csc', random_state=0) + X = sparse.rand(n_samples, 1, density=.99, format='csc') inf_norm_arr = [] for random_state in range(ROUND): transformer = QuantileTransformer(random_state=random_state, From dc39f9e37fa27dcfad4b16703b483a3adf5d3ecb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 8 Apr 2017 18:16:01 +0200 Subject: [PATCH 080/106] FIX spelling doc --- doc/modules/preprocessing.rst | 2 +- sklearn/preprocessing/tests/test_data.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 7e2b3a0862960..e280d9981f76e 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -261,7 +261,7 @@ features. :class:`QuantileTransformer` and :func:`quantile_transform` provide a non-parametric transformation based the quantile function to map the data to a -uniform distribution with value between 0 and 1:: +uniform distribution with values between 0 and 1:: >>> from sklearn.datasets import load_iris >>> iris = load_iris() diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 7da1159ae4d5c..fce79293ea61e 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1011,6 +1011,9 @@ def test_quantile_transform_subsampling(): assert_equal(len(np.unique(inf_norm_arr)), len(inf_norm_arr)) # sparse support + + # TODO: rng should be seeded once we drop support for older versions of + # scipy (< 0.13) that don't support seeding. 
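The ``np.interp`` behaviour documented in ``test_quantile_transform_numpy_interp_behaviour`` can be shown in isolation (plain NumPy, independent of the transformer): when the x-grid contains duplicated values, interpolating a query equal to the duplicate returns the y-value of the last duplicate, i.e. the largest of the tied quantiles::

    import numpy as np

    xp = np.array([0.0, 0.5, 0.5, 0.5, 1.0])    # duplicated quantile value
    fp = np.array([0.0, 0.25, 0.5, 0.75, 1.0])
    print(np.interp(0.5, xp, fp))                # 0.75, not 0.25 or 0.5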
X = sparse.rand(n_samples, 1, density=.99, format='csc') inf_norm_arr = [] for random_state in range(ROUND): From c3cf631094f6ce46aa24d4f00c3739ccae76b65f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 11 Apr 2017 21:15:21 +0200 Subject: [PATCH 081/106] FIX cite example in user guide and docstring --- doc/modules/preprocessing.rst | 7 +++++ sklearn/preprocessing/data.py | 59 ++++++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index e280d9981f76e..5ade5d1a144e4 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -10,6 +10,9 @@ The ``sklearn.preprocessing`` package provides several common utility functions and transformer classes to change raw feature vectors into a representation that is more suitable for the downstream estimators. +Refer to :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py` for a +comparison of the different scalers, transformers, and normalizers. + .. _preprocessing_scaler: Standardization, or mean removal and variance scaling @@ -279,6 +282,10 @@ normal output is clipped so that the input's maximum and minimum --- corresponding to the 1 - 1e-7 and 1e-7 quantiles, respectively --- do not become infinite under the transformation. +:class:`QuantileTransformer` provides a ``smoothing_noise`` parameter to make +the interpretation more intuitive when manually checking the transformation. See +:ref:`sphx_glr_auto_examples_preprocessing_plot_smoothing_noise_quantile_transform.py` + .. _preprocessing_normalization: Normalization diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 2d7ca011736ab..0dfd660c2ade2 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -120,10 +120,14 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True): To avoid memory copy the caller should pass a CSC matrix. + See examples/preprocessing/plot_all_scaling.py for a comparison of the + different scalers, transformers, and normalizers. + See also -------- StandardScaler: Performs scaling to unit variance using the``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`). + """ # noqa X = check_array(X, accept_sparse='csc', copy=copy, ensure_2d=False, warn_on_dtype=True, estimator='the scale function', @@ -244,6 +248,11 @@ class MinMaxScaler(BaseEstimator, TransformerMixin): See also -------- minmax_scale: Equivalent function without the object oriented API. + + Notes + ----- + See examples/preprocessing/plot_all_scaling.py for a comparison of the + different scalers, transformers, and normalizers. """ def __init__(self, feature_range=(0, 1), copy=True): @@ -400,6 +409,11 @@ def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True): -------- MinMaxScaler: Performs scaling to a given range using the``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`). + + Notes + ----- + See examples/preprocessing/plot_all_scaling.py for a comparison of the + different scalers, transformers, and normalizers. """ # noqa # Unlike the scaler object, this function allows 1d input. # If copy is required, it will be done inside the scaler object. @@ -492,6 +506,11 @@ class StandardScaler(BaseEstimator, TransformerMixin): :class:`sklearn.decomposition.PCA` Further removes the linear correlation across features with 'whiten=True'. 
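The difference between the statistics estimated by the scalers referenced in these notes can be verified in a few lines (a sketch relying only on the existing ``StandardScaler`` and ``RobustScaler`` APIs): a single large outlier inflates the mean and standard deviation, while the median and interquartile range barely move::

    import numpy as np
    from sklearn.preprocessing import StandardScaler, RobustScaler

    X = np.array([[1.], [2.], [3.], [4.], [1000.]])   # one large outlier

    ss = StandardScaler().fit(X)
    print(ss.mean_, ss.scale_)       # both pulled far away by the outlier

    rs = RobustScaler().fit(X)
    print(rs.center_, rs.scale_)     # median and IQR stay close to the inliers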
+ + Notes + ----- + See examples/preprocessing/plot_all_scaling.py for a comparison of the + different scalers, transformers, and normalizers. """ # noqa def __init__(self, copy=True, with_mean=True, with_std=True): @@ -694,6 +713,11 @@ class MaxAbsScaler(BaseEstimator, TransformerMixin): See also -------- maxabs_scale: Equivalent function without the object oriented API. + + Notes + ----- + See examples/preprocessing/plot_all_scaling.py for a comparison of the + different scalers, transformers, and normalizers. """ def __init__(self, copy=True): @@ -821,6 +845,11 @@ def maxabs_scale(X, axis=0, copy=True): -------- MaxAbsScaler: Performs scaling to the [-1, 1] range using the``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`). + + Notes + ----- + See examples/preprocessing/plot_all_scaling.py for a comparison of the + different scalers, transformers, and normalizers. """ # noqa # Unlike the scaler object, this function allows 1d input. @@ -913,7 +942,7 @@ class RobustScaler(BaseEstimator, TransformerMixin): Notes ----- - See examples/preprocessing/plot_robust_scaling.py for an example. + See examples/preprocessing/plot_all_scaling.py for an example. https://en.wikipedia.org/wiki/Median_(statistics) https://en.wikipedia.org/wiki/Interquartile_range @@ -1063,6 +1092,9 @@ def robust_scale(X, axis=0, with_centering=True, with_scaling=True, To avoid memory copy the caller should pass a CSR matrix. + See examples/preprocessing/plot_all_scaling.py for a comparison of the + different scalers, transformers, and normalizers. + See also -------- RobustScaler: Performs centering and scaling using the ``Transformer`` API @@ -1279,6 +1311,11 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): -------- Normalizer: Performs normalization using the ``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`). + + Notes + ----- + See examples/preprocessing/plot_all_scaling.py for a comparison of the + different scalers, transformers, and normalizers. """ if norm not in ('l1', 'l2', 'max'): raise ValueError("'%s' is not a supported norm" % norm) @@ -1362,6 +1399,9 @@ class Normalizer(BaseEstimator, TransformerMixin): This estimator is stateless (besides constructor parameters), the fit method does nothing but is useful when used in a pipeline. + See examples/preprocessing/plot_all_scaling.py for a comparison of the + different scalers, transformers, and normalizers. + See also -------- normalize: Equivalent function without the object oriented API. @@ -1994,6 +2034,14 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): RobustScaler : perform robust standardization that removes the influence of outliers but does not put outliers and inliers on the same scale. + Notes + ----- + See examples/preprocessing/plot_all_scaling.py for a comparison of the + different scalers, transformers, and normalizers. + + See examples/preprocessing/plot_smoothing_noise_quantile_transform.py for + an illustration of the ``smoothing_noise`` parameter use. + """ def __init__(self, n_quantiles=1000, output_distribution='uniform', @@ -2401,6 +2449,15 @@ def quantile_transform(X, axis=0, n_quantiles=1000, robust_scale : perform robust standardization that removes the influence of outliers but does not put outliers and inliers on the same scale. + + Notes + ----- + See examples/preprocessing/plot_all_scaling.py for a comparison of the + different scalers, transformers, and normalizers. 
+ + See examples/preprocessing/plot_smoothing_noise_quantile_transform.py for + an illustration of the ``smoothing_noise`` parameter use. + """ n = QuantileTransformer(n_quantiles=n_quantiles, output_distribution=output_distribution, From 570c5d08d02ef56697b0a97b79f439d99edd459e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 12 Apr 2017 14:01:54 +0200 Subject: [PATCH 082/106] FIX olivier comments --- doc/modules/classes.rst | 2 + doc/modules/preprocessing.rst | 9 +++-- examples/preprocessing/plot_all_scaling.py | 39 ++++++++++--------- ...plot_smoothing_noise_quantile_transform.py | 3 +- sklearn/preprocessing/data.py | 16 +++----- sklearn/preprocessing/tests/test_data.py | 17 ++++---- 6 files changed, 45 insertions(+), 41 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 3101488fd6661..92d9b93cf9a6b 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1194,6 +1194,7 @@ See the :ref:`metrics` section of the user guide for further details. preprocessing.Normalizer preprocessing.OneHotEncoder preprocessing.PolynomialFeatures + preprocessing.QuantileTransformer preprocessing.RobustScaler preprocessing.StandardScaler @@ -1207,6 +1208,7 @@ See the :ref:`metrics` section of the user guide for further details. preprocessing.maxabs_scale preprocessing.minmax_scale preprocessing.normalize + preprocessing.quantile_transform preprocessing.robust_scale preprocessing.scale diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 5ade5d1a144e4..f780101f0f2d0 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -267,10 +267,13 @@ non-parametric transformation based the quantile function to map the data to a uniform distribution with values between 0 and 1:: >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import train_test_split >>> iris = load_iris() >>> X, y = iris.data, iris.target + >>> X_train, X_test, y_train, y_test = train_test_split(X, y) >>> quantile_transformer = preprocessing.QuantileTransformer() - >>> X_trans = quantile_transformer.fit_transform(iris.data) + >>> X_train_trans = quantile_transformer.fit_transform(X_train) + >>> X_test_trans = quantile_transformer.transform(X_test) It is also possible to map the transformed data to a normal distribution by setting ``output_distribution='normal'``:: @@ -278,8 +281,8 @@ setting ``output_distribution='normal'``:: >>> X_trans = preprocessing.quantile_transform(X, output_distribution='normal') Thus the median of the input becomes the mean of the output, centered at 0. The -normal output is clipped so that the input's maximum and minimum --- -corresponding to the 1 - 1e-7 and 1e-7 quantiles, respectively --- do not +normal output is clipped so that the input's minimum and maximum --- +corresponding to the 1e-7 and 1 - 1e-7 quantiles respectively --- do not become infinite under the transformation. :class:`QuantileTransformer` provides a ``smoothing_noise`` parameter to make diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 41d76447d0fa6..9df73dfe71aea 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -6,24 +6,27 @@ Compare the effect of different scalers on data with outliers ============================================================= -The feature 0 and feature 5 of california housing dataset are outside -of the typical range [0, 1] and contain large outliers. 
These two -characteristics lead to difficulties to visualize the data and, more -importantly, they can degrade the fitting procedure of most of machine -learning algorithms. - -Indeed, data spread in the standard range [0, 1] is a requirement for -a large number of machine learning algorithms such as metrics-based -algorithms or algorithms using gradient-based optimization. - -This example uses different scalers, transformers and normalizers to -bring the data within a smaller range. Scalers are linear -transformations and differ from each other depending of the subset of -data which is considered to define the estimate using during -scaling. ``QuantileTransformer`` provides a non-linear transformation -in which distances between marginal outliers and inliers are -shrunk. Unlinke the previous transformation, normalization refers to a -per sample transformation instead of a per feature transformation. +The feature 0 and feature 5 of California housing dataset are outside of the +typical range [0, 1] and contain large outliers. These two characteristics lead +to difficulties to visualize the data and, more importantly, they can degrade +the fitting procedure of most of machine learning algorithms. + +Indeed many estimators assume that each feature takes values spread around or +close to zero and more importantly that all features vary on comparable +scales. In particular metric-based and gradient-based estimators often assume +approximately standardized data (centered features with unit variances). A +notable exception are decision tree-based estimators that are robust to +arbitrary scaling of the data. + +This example uses different scalers, transformers and normalizers to bring the +data within a pre-defined range. + +Scalers are linear (or more exactly affine) transformations and differ from +each other in the way to estimate the parameters used to shift and scale each +feature. ``QuantileTransformer`` provides a non-linear transformation in which +distances between marginal outliers and inliers are shrunk. Unlike the +previous transformations, normalization refers to a per sample transformation +instead of a per feature transformation. """ diff --git a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py index ca18e4a0c62c3..f6d6e7b61ac7b 100755 --- a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py +++ b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py @@ -85,6 +85,7 @@ def plot_transform_feat_val(ax, transformer, title): qt.fit(X) plot_transform_feat_val(ax2, qt, 'With smoothing') plt.tight_layout() +plt.show() ############################################################################### # By default, the ``QuantileTransformer`` does not apply any smoothing @@ -104,5 +105,3 @@ def plot_transform_feat_val(ax, transformer, title): # computation of the quantiles. The parameter ``smoothing_noise`` offers # this possibility as illustrated above. # In this case, the marker is centered at the median as expected. - -plt.show() diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 0dfd660c2ade2..af30c2dc5abbd 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2027,12 +2027,10 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): See also -------- quantile_transform : Equivalent function without the object oriented API. - StandardScaler : perform standardization that is faster, but less robust - to outliers. 
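The per-sample nature of normalization mentioned in the example's docstring can be illustrated with the existing ``Normalizer`` (a quick sketch, not part of the patch): each row is rescaled to unit norm independently of the other rows, in contrast with the per-feature scalers and transformers::

    import numpy as np
    from sklearn.preprocessing import Normalizer

    X = np.array([[3., 4.],
                  [1., 0.]])
    print(Normalizer(norm='l2').fit_transform(X))
    # [[ 0.6  0.8]
    #  [ 1.   0. ]]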
- + to outliers. RobustScaler : perform robust standardization that removes the influence - of outliers but does not put outliers and inliers on the same scale. + of outliers but does not put outliers and inliers on the same scale. Notes ----- @@ -2441,14 +2439,12 @@ def quantile_transform(X, axis=0, n_quantiles=1000, See also -------- QuantileTransformer : Performs quantile-based scaling using the - ``Transformer`` API (e.g. as part of a preprocessing - :class:`sklearn.pipeline.Pipeline`). - + ``Transformer`` API (e.g. as part of a preprocessing + :class:`sklearn.pipeline.Pipeline`). scale : perform standardization that is faster, but less robust - to outliers. - + to outliers. robust_scale : perform robust standardization that removes the influence - of outliers but does not put outliers and inliers on the same scale. + of outliers but does not put outliers and inliers on the same scale. Notes ----- diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index fce79293ea61e..e1344b1583971 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -19,6 +19,7 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_less from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_greater_equal from sklearn.utils.testing import assert_less_equal from sklearn.utils.testing import assert_raises @@ -1103,7 +1104,8 @@ def test_quantile_transform_add_noise_subsamples(): transformer.fit(X) # check that the feature values associated to quantiles are strictly # monitically increasing as suggested by the 'interp' function from numpy - assert_true(np.all(np.diff(transformer.quantiles_) > 0)) + diff_quantiles = np.diff(transformer.quantiles_, axis=0) + map(assert_greater, diff_quantiles, [0] * len(diff_quantiles)) # iris dataset X = iris.data transformer = QuantileTransformer(n_quantiles=1000, smoothing_noise=1e-7, @@ -1114,7 +1116,9 @@ def test_quantile_transform_add_noise_subsamples(): # check that the feature values associated to quantiles are # strictly monitically increasing as suggested by the 'interp' # function from numpy - assert_true(np.all(np.diff(transformer.quantiles_, axis=0) > 0)) + diff_quantiles = np.diff(transformer.quantiles_, axis=0) + for dq in diff_quantiles.T: + map(assert_greater, dq, [0] * len(dq)) def test_quantile_transform_numpy_interp_behaviour(): @@ -1934,13 +1938,10 @@ def test_fit_cold_start(): scaler.fit_transform(X_2d) -def test_function_valid_axis(): +def test_quantile_transform_valid_axis(): X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]) - func_list = [quantile_transform] - - for func in func_list: - assert_raises_regex(ValueError, "axis should be either equal to 0 or 1" - ". Got axis=2", func, X.T, axis=2) + assert_raises_regex(ValueError, "axis should be either equal to 0 or 1" + ". 
Got axis=2", quantile_transform, X.T, axis=2) From da5604de9c07a8dc102c1adca979237d40a31960 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Apr 2017 17:21:23 +0200 Subject: [PATCH 083/106] EHN improve the example comparing all the pre-processing methods --- doc/modules/preprocessing.rst | 24 +-- examples/preprocessing/plot_all_scaling.py | 179 ++++++++++++++++----- 2 files changed, 154 insertions(+), 49 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index f780101f0f2d0..89dbfc0832f41 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -10,8 +10,13 @@ The ``sklearn.preprocessing`` package provides several common utility functions and transformer classes to change raw feature vectors into a representation that is more suitable for the downstream estimators. -Refer to :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py` for a -comparison of the different scalers, transformers, and normalizers. +In general, learning algorithm benefit from standardization of the data set. If +some outliers are present in the set, robust scalers or transformers are more +appropriate. The behaviors of the different scalers, transformers, and +normalizers on a dataset containing marginal outliers is highlighted in +ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. In the +following, a description of these methods is given. + .. _preprocessing_scaler: @@ -42,10 +47,10 @@ operation on a single array-like dataset:: >>> from sklearn import preprocessing >>> import numpy as np - >>> X = np.array([[ 1., -1., 2.], - ... [ 2., 0., 0.], - ... [ 0., 1., -1.]]) - >>> X_scaled = preprocessing.scale(X) + >>> X_train = np.array([[ 1., -1., 2.], + ... [ 2., 0., 0.], + ... [ 0., 1., -1.]]) + >>> X_scaled = preprocessing.scale(X_train) >>> X_scaled # doctest: +ELLIPSIS array([[ 0. ..., -1.22..., 1.33...], @@ -74,7 +79,7 @@ able to later reapply the same transformation on the testing set. This class is hence suitable for use in the early steps of a :class:`sklearn.pipeline.Pipeline`:: - >>> scaler = preprocessing.StandardScaler().fit(X) + >>> scaler = preprocessing.StandardScaler().fit(X_train) >>> scaler StandardScaler(copy=True, with_mean=True, with_std=True) @@ -84,7 +89,7 @@ This class is hence suitable for use in the early steps of a >>> scaler.scale_ # doctest: +ELLIPSIS array([ 0.81..., 0.81..., 1.24...]) - >>> scaler.transform(X) # doctest: +ELLIPSIS + >>> scaler.transform(X_train) # doctest: +ELLIPSIS array([[ 0. ..., -1.22..., 1.33...], [ 1.22..., 0. 
..., -0.26...], [-1.22..., 1.22..., -1.06...]]) @@ -93,7 +98,8 @@ This class is hence suitable for use in the early steps of a The scaler instance can then be used on new data to transform it the same way it did on the training set:: - >>> scaler.transform([[-1., 1., 0.]]) # doctest: +ELLIPSIS + >>> X_test = [[-1., 1., 0.]] + >>> scaler.transform(X_test) # doctest: +ELLIPSIS array([[-2.44..., 1.22..., -0.26...]]) It is possible to disable either centering or scaling by either diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 9df73dfe71aea..fbb72b52e346b 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -31,6 +31,7 @@ """ # Author: Raghav RV +# Guillaume Lemaitre # Thomas Unterthiner # License: BSD 3 clause @@ -42,7 +43,7 @@ import matplotlib as mpl from matplotlib import pyplot as plt -from matplotlib import cm, gridspec +from matplotlib import cm from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import minmax_scale @@ -67,14 +68,14 @@ distributions = OrderedDict(( ('Unscaled data', X), - ('Data after min-max scaling', - MinMaxScaler().fit_transform(X)), + ('Data after standard scaling', + StandardScaler().fit_transform(X)), ('Data after robust scaling', RobustScaler(quantile_range=(25, 75)).fit_transform(X)), + ('Data after min-max scaling', + MinMaxScaler().fit_transform(X)), ('Data after max-abs scaling', MaxAbsScaler().fit_transform(X)), - ('Data after standard scaling', - StandardScaler().fit_transform(X)), ('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X)), ('Data after quantile transformation (uniform pdf)', @@ -87,12 +88,50 @@ y = minmax_scale(y_full) # To make colors corresponding to the target), -def plot_distribution(axes, X, y, hist_nbins=50, plot_title="", size=(15, 10), +def create_axes(figsize=(8, 8)): + plt.figure(figsize=figsize) + + # define the axis for the first plot + left, width = 0.1, 0.22 + bottom, height = 0.1, 0.2 + bottom_h = left_h = left + width + 0.02 + + rect_scatter = [left, bottom, width, height] + rect_histx = [left, bottom_h, width, 0.1] + rect_histy = [left_h, bottom, 0.1, height] + + ax_scatter = plt.axes(rect_scatter) + ax_histx = plt.axes(rect_histx) + ax_histy = plt.axes(rect_histy) + + # define the axis for the zoomed-in plot + left = width + left + 0.2 + left_h = left + width + 0.02 + + rect_scatter = [left, bottom, width, height] + rect_histx = [left, bottom_h, width, 0.1] + rect_histy = [left_h, bottom, 0.1, height] + + ax_scatter_zoom = plt.axes(rect_scatter) + ax_histx_zoom = plt.axes(rect_histx) + ax_histy_zoom = plt.axes(rect_histy) + + # define the axis for the colorbar + left, width = width + left + 0.13, 0.01 + + rect_colorbar = [left, bottom, width, height] + ax_colorbar = plt.axes(rect_colorbar) + + return ((ax_scatter, ax_histy, ax_histx), + (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom), + ax_colorbar) + + +def plot_distribution(axes, X, y, hist_nbins=50, title="", X_label="", y_label=""): - ax, hist_X1, hist_X0, empty = axes - empty.axis('off') + ax, hist_X1, hist_X0 = axes - ax.set_title(plot_title, fontsize=12) + ax.set_title(title) ax.set_xlabel(X_label) ax.set_ylabel(y_label) @@ -101,10 +140,13 @@ def plot_distribution(axes, X, y, hist_nbins=50, plot_title="", size=(15, 10), ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker='o', s=5, lw=0, c=colors) # Removing the top and the right spine for aesthetics + # make nice axis layout ax.spines['top'].set_visible(False) 
ax.spines['right'].set_visible(False) ax.get_xaxis().tick_bottom() ax.get_yaxis().tick_left() + ax.spines['left'].set_position(('outward', 10)) + ax.spines['bottom'].set_position(('outward', 10)) # Histogram for axis X1 (feature 5) hist_X1.set_ylim(ax.get_ylim()) @@ -114,52 +156,109 @@ def plot_distribution(axes, X, y, hist_nbins=50, plot_title="", size=(15, 10), # Histogram for axis X0 (feature 0) hist_X0.set_xlim(ax.get_xlim()) - hist_X0.invert_yaxis() hist_X0.hist(X[:, 0], bins=hist_nbins, orientation='vertical', color='grey', ec='grey') hist_X0.axis('off') -n_dist = len(distributions) -fig = plt.figure(figsize=(15, n_dist * 8 + 1)) -gs = gridspec.GridSpec(n_dist * 2 + 1, 5, - width_ratios=[5, 1, 0.1, 5, 1], wspace=0.3, - height_ratios=[5, 1] * n_dist + [0.4], - hspace=0.4) -subplots = list(plt.subplot(g) for g in gs) - -for i, (title, X) in enumerate(distributions.items()): - offset = 10 * i - # Distribution with all outliers - axes = subplots[offset:offset + 2] + subplots[offset + 5:offset + 7] - plot_distribution(axes, X, y, hist_nbins=200, - plot_title=title + " including outliers\n", - X_label="Median Income", y_label="Number of households") +############################################################################### +# Two plots will be shown for each scaler/normalizer/transformer. The left +# figure will show a scatter plot of the full data set while the right figure +# will exclude the extreme values considering only 99 % of the data set, +# excluding marginal outliers. In addition, the marginal distributions for each +# feature will be shown on the side of the scatter plot. - # Some blank vertical space between two plots so they don't overlap - subplots[offset + 2].axis('off') - subplots[offset + 7].axis('off') +def make_plot(item_idx): + title, X = distributions.items()[item_idx] + ax_zoom_out, ax_zoom_in, ax_colorbar = create_axes() + axarr = (ax_zoom_out, ax_zoom_in) + plot_distribution(axarr[0], X, y, hist_nbins=200, + title=title + ' including outliers', + X_label="Median Income", y_label="Number of households") + + # zoom-in zoom_in_percentile_range = (0, 99) - # Distribution with extreme outliers removed cutoffs_X0 = np.percentile(X[:, 0], zoom_in_percentile_range) cutoffs_X1 = np.percentile(X[:, 1], zoom_in_percentile_range) non_outliers_mask = ( np.all(X > [cutoffs_X0[0], cutoffs_X1[0]], axis=1) & np.all(X < [cutoffs_X0[1], cutoffs_X1[1]], axis=1)) - axes = subplots[offset + 3:offset + 5] + subplots[offset + 8:offset + 10] - plot_distribution(axes, X[non_outliers_mask], y[non_outliers_mask], + plot_distribution(axarr[1], X[non_outliers_mask], y[non_outliers_mask], hist_nbins=50, - plot_title=(title + - "\nZoomed-in at percentile range %s" - % str(zoom_in_percentile_range)), + title=title + '\nZoomed-in at percentile range (0, 99)', X_label="Median Income", y_label="Number of households") -# Plot a heatmap legend for the y, combining a row of 4 cols -heatmap_legend_ax = plt.subplot(gs[-5:]) -norm = mpl.colors.Normalize(y_full.min(), y_full.max()) -mpl.colorbar.ColorbarBase(heatmap_legend_ax, cmap=cm.plasma_r, - norm=norm, orientation='horizontal', - label='Color mapping for values of y') + norm = mpl.colors.Normalize(y_full.min(), y_full.max()) + mpl.colorbar.ColorbarBase(ax_colorbar, cmap=cm.plasma_r, + norm=norm, orientation='vertical', + label='Color mapping for values of y') + + +############################################################################### +# A large majority of the samples in the original data set are compacted to a +# specific range, [0, 6] 
for the 1st feature and [0, 10] for the second +# feature. However, as shown on the right figure, there is some marginal +# outliers which might alterate the learning procedure of the some machine +# learning algorithms. Therefore, depending of the application, a specific +# pre-processing is beneficial. In the following, we present some insights and +# behaviors of those pre-processing methods, with the presence of marginal +# outliers. + +make_plot(0) + +############################################################################### +# The ``StandardScaler`` removes the mean and scale the data to a unit +# variance. However, the outliers have an influence when computing the +# empirical mean and standard deviation which shrink the range of the feature +# values as shown in the left figure below. + +make_plot(1) + +############################################################################### +# Unlike, the ``StandardScaler``, the statistics (i.e. median, 1st and 3rd +# quartiles) computed to scale the data set will not be influenced by marginal +# outliers. Consequently, the range of the feature values is larger than in the +# previous example, as shown in the zoomed-in figure. Note that the outliers +# remain far from the inliers. + +make_plot(2) + + +############################################################################### +# The ``MinMaxScaler`` rescales the data set such that all feature values are +# in the range [0, 1] as shown in the right figure below. However, this scaling +# compress all inliers in the narrow range [0, 0.005]. + +make_plot(3) + +############################################################################### +# The ``MaxAbsScaler`` differs from the previous scaler such that the absolute +# values are mapped in the range [0, 1]. Therefore, in the current example, +# there is no observable difference since the feature values are originally +# positive. + +make_plot(4) + +############################################################################### +# The ``Normalizer`` rescales each sample will scale to a unit norm. It can be +# seen on both figures below where all samples are mapped to the unit circle. + +make_plot(5) + +############################################################################### +# The ``QuantileNormalizer`` applies a non-linear transformation such that the +# probability density function of each feature will be mapped to a uniform +# distribution. In this case, all the data will be mapped in the range [0, 1], +# even the outliers which cannot be distinguished anymore from the inliers. + +make_plot(6) + +############################################################################### +# The ``QuantileNormalizer`` has an additional ``output_distribution`` +# parameter allowing to match a Gaussian distribution instead of a normal +# distribution. 
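As a rough, standalone sketch of what ``output_distribution='normal'`` does (synthetic data and illustrative variable names only, not the housing features used in this example)::

    import numpy as np
    from sklearn.preprocessing import QuantileTransformer

    rng = np.random.RandomState(0)
    X_skewed = rng.lognormal(size=(1000, 1))      # strongly right-skewed feature
    qt = QuantileTransformer(n_quantiles=100, output_distribution='normal',
                             random_state=0)
    X_gauss = qt.fit_transform(X_skewed)
    # X_gauss is approximately standard normal; the most extreme inputs are
    # clipped rather than mapped to +/- infinity.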
+ +make_plot(7) plt.show() From 7871513a0cb29e1e07a90afe654a4e8d81d1890a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Apr 2017 18:01:24 +0200 Subject: [PATCH 084/106] FIX/DOC remove title --- examples/preprocessing/plot_all_scaling.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index fbb72b52e346b..56f68fc7b098c 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -88,7 +88,7 @@ y = minmax_scale(y_full) # To make colors corresponding to the target), -def create_axes(figsize=(8, 8)): +def create_axes(figsize=(16, 8)): plt.figure(figsize=figsize) # define the axis for the first plot @@ -169,11 +169,10 @@ def plot_distribution(axes, X, y, hist_nbins=50, title="", def make_plot(item_idx): - title, X = distributions.items()[item_idx] + _, X = distributions.items()[item_idx] ax_zoom_out, ax_zoom_in, ax_colorbar = create_axes() axarr = (ax_zoom_out, ax_zoom_in) plot_distribution(axarr[0], X, y, hist_nbins=200, - title=title + ' including outliers', X_label="Median Income", y_label="Number of households") # zoom-in @@ -186,7 +185,6 @@ def make_plot(item_idx): np.all(X < [cutoffs_X0[1], cutoffs_X1[1]], axis=1)) plot_distribution(axarr[1], X[non_outliers_mask], y[non_outliers_mask], hist_nbins=50, - title=title + '\nZoomed-in at percentile range (0, 99)', X_label="Median Income", y_label="Number of households") norm = mpl.colors.Normalize(y_full.min(), y_full.max()) From 52e4edf3fad693aa9068c8caf1f1ed7792f22b70 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Apr 2017 18:58:21 +0200 Subject: [PATCH 085/106] FIX change the scaling of the figure --- examples/preprocessing/plot_all_scaling.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 56f68fc7b098c..69bfa1881a400 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -88,7 +88,7 @@ y = minmax_scale(y_full) # To make colors corresponding to the target), -def create_axes(figsize=(16, 8)): +def create_axes(figsize=(24, 24)): plt.figure(figsize=figsize) # define the axis for the first plot @@ -98,7 +98,7 @@ def create_axes(figsize=(16, 8)): rect_scatter = [left, bottom, width, height] rect_histx = [left, bottom_h, width, 0.1] - rect_histy = [left_h, bottom, 0.1, height] + rect_histy = [left_h, bottom, 0.05, height] ax_scatter = plt.axes(rect_scatter) ax_histx = plt.axes(rect_histx) @@ -110,7 +110,7 @@ def create_axes(figsize=(16, 8)): rect_scatter = [left, bottom, width, height] rect_histx = [left, bottom_h, width, 0.1] - rect_histy = [left_h, bottom, 0.1, height] + rect_histy = [left_h, bottom, 0.05, height] ax_scatter_zoom = plt.axes(rect_scatter) ax_histx_zoom = plt.axes(rect_histx) @@ -258,5 +258,4 @@ def make_plot(item_idx): # distribution. 
make_plot(7) - plt.show() From 28cc2af34aeda57833e7eb8d60fee05e70606ca0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Apr 2017 15:17:22 +0200 Subject: [PATCH 086/106] FIX plotting layout --- examples/preprocessing/plot_all_scaling.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 69bfa1881a400..8167b5171669c 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -88,13 +88,14 @@ y = minmax_scale(y_full) # To make colors corresponding to the target), -def create_axes(figsize=(24, 24)): +def create_axes(figsize=(8, 8)): plt.figure(figsize=figsize) # define the axis for the first plot left, width = 0.1, 0.22 - bottom, height = 0.1, 0.2 - bottom_h = left_h = left + width + 0.02 + bottom, height = 0.1, 0.7 + bottom_h = height + 0.15 + left_h = left + width + 0.02 rect_scatter = [left, bottom, width, height] rect_histx = [left, bottom_h, width, 0.1] From 6cdf964059de087a433a00be426ec54135274347 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Apr 2017 15:52:37 +0200 Subject: [PATCH 087/106] FIX ratio w/h --- examples/preprocessing/plot_all_scaling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 8167b5171669c..23269870dce1f 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -88,7 +88,7 @@ y = minmax_scale(y_full) # To make colors corresponding to the target), -def create_axes(figsize=(8, 8)): +def create_axes(figsize=(8, 6)): plt.figure(figsize=figsize) # define the axis for the first plot From 58c64c21045bba1c633e8eac28ccaa3a5778ef2c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 18 Apr 2017 16:03:44 +0200 Subject: [PATCH 088/106] Reorder and reword the plot_all_scaling example --- examples/preprocessing/plot_all_scaling.py | 168 +++++++++++++-------- 1 file changed, 104 insertions(+), 64 deletions(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 23269870dce1f..3a27cf8a277fc 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -6,13 +6,15 @@ Compare the effect of different scalers on data with outliers ============================================================= -The feature 0 and feature 5 of California housing dataset are outside of the -typical range [0, 1] and contain large outliers. These two characteristics lead -to difficulties to visualize the data and, more importantly, they can degrade -the fitting procedure of most of machine learning algorithms. - -Indeed many estimators assume that each feature takes values spread around or -close to zero and more importantly that all features vary on comparable +Feature 0 (median income in a block) and feature 5 (number of households) of +the California housing dataset have very different scales and contain some very +large outliers. These two characteristics lead to difficulties to visualize the +data and, more importantly, they can degrade the predictive performance of many +machine learning algorithms. Unscaled data can also slow down or even prevent +the convergence of many gradient-based estimators. 
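A quick way to see this scale mismatch directly (a sketch assuming the data is loaded with ``fetch_california_housing`` and that the two features are selected by index, as in the rest of this script)::

    import numpy as np
    from sklearn.datasets import fetch_california_housing

    dataset = fetch_california_housing()
    X_raw = dataset.data[:, [0, 5]]
    # inspect how far the maxima sit above the medians for each feature
    print(np.median(X_raw, axis=0))
    print(X_raw.max(axis=0))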
+ +Indeed many estimators are designed with the assumption that each feature takes +values close to zero or more importantly that all features vary on comparable scales. In particular metric-based and gradient-based estimators often assume approximately standardized data (centered features with unit variances). A notable exception are decision tree-based estimators that are robust to @@ -21,12 +23,15 @@ This example uses different scalers, transformers and normalizers to bring the data within a pre-defined range. -Scalers are linear (or more exactly affine) transformations and differ from -each other in the way to estimate the parameters used to shift and scale each -feature. ``QuantileTransformer`` provides a non-linear transformation in which -distances between marginal outliers and inliers are shrunk. Unlike the -previous transformations, normalization refers to a per sample transformation -instead of a per feature transformation. +Scalers are linear (or more exactly affine) transformers and differ from each +other in the way to estimate the parameters used to shift and scale each +feature. + +``QuantileTransformer`` provides a non-linear transformation in which distances +between marginal outliers and inliers are shrunk. + +Unlike the previous transformations, normalization refers to a per sample +transformation instead of a per feature transformation. """ @@ -70,20 +75,21 @@ ('Unscaled data', X), ('Data after standard scaling', StandardScaler().fit_transform(X)), - ('Data after robust scaling', - RobustScaler(quantile_range=(25, 75)).fit_transform(X)), ('Data after min-max scaling', MinMaxScaler().fit_transform(X)), ('Data after max-abs scaling', MaxAbsScaler().fit_transform(X)), - ('Data after sample-wise L2 normalizing', - Normalizer().fit_transform(X)), + ('Data after robust scaling', + RobustScaler(quantile_range=(25, 75)).fit_transform(X)), ('Data after quantile transformation (uniform pdf)', QuantileTransformer(output_distribution='uniform') .fit_transform(X)), ('Data after quantile transformation (gaussian pdf)', QuantileTransformer(output_distribution='normal') - .fit_transform(X)))) + .fit_transform(X)), + ('Data after sample-wise L2 normalizing', + Normalizer().fit_transform(X)), +)) y = minmax_scale(y_full) # To make colors corresponding to the target), @@ -129,12 +135,12 @@ def create_axes(figsize=(8, 6)): def plot_distribution(axes, X, y, hist_nbins=50, title="", - X_label="", y_label=""): + x0_label="", x1_label=""): ax, hist_X1, hist_X0 = axes ax.set_title(title) - ax.set_xlabel(X_label) - ax.set_ylabel(y_label) + ax.set_xlabel(x0_label) + ax.set_ylabel(x1_label) # The scatter plot colors = cm.plasma_r(y) @@ -170,11 +176,12 @@ def plot_distribution(axes, X, y, hist_nbins=50, title="", def make_plot(item_idx): - _, X = distributions.items()[item_idx] + _, X = list(distributions.items())[item_idx] ax_zoom_out, ax_zoom_in, ax_colorbar = create_axes() axarr = (ax_zoom_out, ax_zoom_in) plot_distribution(axarr[0], X, y, hist_nbins=200, - X_label="Median Income", y_label="Number of households") + x0_label="Median Income", + x1_label="Number of households") # zoom-in zoom_in_percentile_range = (0, 99) @@ -186,7 +193,8 @@ def make_plot(item_idx): np.all(X < [cutoffs_X0[1], cutoffs_X1[1]], axis=1)) plot_distribution(axarr[1], X[non_outliers_mask], y[non_outliers_mask], hist_nbins=50, - X_label="Median Income", y_label="Number of households") + x0_label="Median Income", + x1_label="Number of households") norm = mpl.colors.Normalize(y_full.min(), y_full.max()) 
mpl.colorbar.ColorbarBase(ax_colorbar, cmap=cm.plasma_r, @@ -194,69 +202,101 @@ def make_plot(item_idx): label='Color mapping for values of y') -############################################################################### -# A large majority of the samples in the original data set are compacted to a -# specific range, [0, 6] for the 1st feature and [0, 10] for the second -# feature. However, as shown on the right figure, there is some marginal -# outliers which might alterate the learning procedure of the some machine -# learning algorithms. Therefore, depending of the application, a specific -# pre-processing is beneficial. In the following, we present some insights and -# behaviors of those pre-processing methods, with the presence of marginal -# outliers. +######################################################################## +# Original data +# ------------- +# +# The following plot displays the original data distribution in the left panel +# and the zoomed in version in the right panel. A large majority of the samples +# are compacted to a specific range, [0, 6] for the median income and [0, 10] +# for the number of households. Note that there are some marginal outliers +# (some blocks have more than 1200 households). Therefore a specific +# pre-processing can be very beneficial depending of the application. In the +# following, we present some insights and behaviors of those pre-processing +# methods in the presence of marginal outliers. make_plot(0) -############################################################################### -# The ``StandardScaler`` removes the mean and scale the data to a unit -# variance. However, the outliers have an influence when computing the -# empirical mean and standard deviation which shrink the range of the feature -# values as shown in the left figure below. +############################################################################## +# StandardScaler +# -------------- +# +# ``StandardScaler`` removes the mean and scale the data to a unit variance. +# However, the outliers have an influence when computing the empirical mean and +# standard deviation which shrink the range of the feature values as shown in +# the left figure below. make_plot(1) -############################################################################### -# Unlike, the ``StandardScaler``, the statistics (i.e. median, 1st and 3rd -# quartiles) computed to scale the data set will not be influenced by marginal -# outliers. Consequently, the range of the feature values is larger than in the -# previous example, as shown in the zoomed-in figure. Note that the outliers -# remain far from the inliers. - -make_plot(2) - - -############################################################################### -# The ``MinMaxScaler`` rescales the data set such that all feature values are -# in the range [0, 1] as shown in the right figure below. However, this scaling +############################################################################## +# MinMaxScaler +# ------------ +# +# ``MinMaxScaler`` rescales the data set such that all feature values are in +# the range [0, 1] as shown in the right figure below. However, this scaling # compress all inliers in the narrow range [0, 0.005]. 
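The sensitivity to outliers discussed here can be reproduced on a tiny synthetic feature (a standalone sketch, independent of the housing data)::

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

    x = np.array([[1.], [2.], [3.], [4.], [1000.]])   # one large outlier
    for scaler in (StandardScaler(), MinMaxScaler(), RobustScaler()):
        print(scaler.__class__.__name__, scaler.fit_transform(x).ravel())
    # StandardScaler and MinMaxScaler squeeze the four inliers into a tiny
    # interval, while RobustScaler keeps them spread out and simply leaves the
    # outlier far away.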
-make_plot(3) +make_plot(2) ############################################################################### -# The ``MaxAbsScaler`` differs from the previous scaler such that the absolute +# MaxAbsScaler +# ------------ +# +# ``MaxAbsScaler`` differs from the previous scaler such that the absolute # values are mapped in the range [0, 1]. Therefore, in the current example, # there is no observable difference since the feature values are originally # positive. -make_plot(4) +make_plot(3) -############################################################################### -# The ``Normalizer`` rescales each sample will scale to a unit norm. It can be -# seen on both figures below where all samples are mapped to the unit circle. +####################################################################### +# RobustScaler +# ------------ +# +# Unlike the previous scalers, the centering and scaling statistics of this +# scaler are based on percentiles and are therefore not influenced by a few +# number of very large marginal outliers. Consequently, the resulting range of +# the transformed feature values is larger than for the previous scalers and +# more importantly are approximately similar: for both features most of the +# transformed values lie in a [-2, 3] range as seen in the zoomed-in figure. +# Note that the outliers themselves are still present in the transformed data. +# If trimming the outliers is desirable, a non-linear transformation is +# required (see below). -make_plot(5) +make_plot(4) -############################################################################### -# The ``QuantileNormalizer`` applies a non-linear transformation such that the +############################################################################## +# QuantileTransformer (Uniform output) +# ----------------------------------- +# +# ``QuantileTransformer`` applies a non-linear transformation such that the # probability density function of each feature will be mapped to a uniform # distribution. In this case, all the data will be mapped in the range [0, 1], # even the outliers which cannot be distinguished anymore from the inliers. +make_plot(5) + +############################################################################## +# QuantileTransformer (Gaussian output) +# ------------------------------------- +# +# ``QuantileTransformer`` has an additional ``output_distribution`` parameter +# allowing to match a Gaussian distribution instead of a normal distribution. +# Note that this non-parametetric transformer introduces saturation artifacts +# for extreme values. + make_plot(6) -############################################################################### -# The ``QuantileNormalizer`` has an additional ``output_distribution`` -# parameter allowing to match a Gaussian distribution instead of a normal -# distribution. +############################################################################## +# Normalizer +# ---------- +# +# The ``Normalizer`` rescales the vector for each sample to have unit norm, +# independently of the distribution of the samples. It can be seen on both +# figures below where all samples are mapped to the unit circle. In our example +# the two selected features have only positive values therefore the transformed +# only lie in the positive quadrant. This would not be the case if some +# original features had a mix of positive and negative values. 
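The per-sample nature of ``Normalizer`` is easy to check on a couple of hand-picked rows (a standalone toy sketch)::

    import numpy as np
    from sklearn.preprocessing import Normalizer

    X_toy = np.array([[3., 4.],
                      [0.5, 0.5]])
    print(Normalizer(norm='l2').fit_transform(X_toy))
    # each row is divided by its own Euclidean norm, so the first row becomes
    # [0.6, 0.8] and both rows end up on the unit circle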
make_plot(7) plt.show() From 1a181fa79e39f5bdfdac8830564e49daee68d73f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 20 Apr 2017 10:44:40 +0200 Subject: [PATCH 089/106] Fix aspect ratio and better explanations in the plot_all_scaling.py example --- examples/preprocessing/plot_all_scaling.py | 63 +++++++++++++++------- 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 3a27cf8a277fc..197ae92702b71 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -7,11 +7,13 @@ ============================================================= Feature 0 (median income in a block) and feature 5 (number of households) of -the California housing dataset have very different scales and contain some very -large outliers. These two characteristics lead to difficulties to visualize the -data and, more importantly, they can degrade the predictive performance of many -machine learning algorithms. Unscaled data can also slow down or even prevent -the convergence of many gradient-based estimators. +the `California housing dataset +`_ have very +different scales and contain some very large outliers. These two +characteristics lead to difficulties to visualize the data and, more +importantly, they can degrade the predictive performance of many machine +learning algorithms. Unscaled data can also slow down or even prevent the +convergence of many gradient-based estimators. Indeed many estimators are designed with the assumption that each feature takes values close to zero or more importantly that all features vary on comparable @@ -33,6 +35,9 @@ Unlike the previous transformations, normalization refers to a per sample transformation instead of a per feature transformation. +The following code is a bit verbose, feel free to jump directly to the analysis +of the results_. + """ # Author: Raghav RV @@ -94,7 +99,7 @@ y = minmax_scale(y_full) # To make colors corresponding to the target), -def create_axes(figsize=(8, 6)): +def create_axes(figsize=(16, 6)): plt.figure(figsize=figsize) # define the axis for the first plot @@ -203,12 +208,14 @@ def make_plot(item_idx): ######################################################################## +# .. _results: +# # Original data # ------------- # # The following plot displays the original data distribution in the left panel # and the zoomed in version in the right panel. A large majority of the samples -# are compacted to a specific range, [0, 6] for the median income and [0, 10] +# are compacted to a specific range, [0, 10] for the median income and [0, 6] # for the number of households. Note that there are some marginal outliers # (some blocks have more than 1200 households). Therefore a specific # pre-processing can be very beneficial depending of the application. In the @@ -217,39 +224,50 @@ def make_plot(item_idx): make_plot(0) -############################################################################## +####################################################################### # StandardScaler # -------------- # # ``StandardScaler`` removes the mean and scale the data to a unit variance. # However, the outliers have an influence when computing the empirical mean and # standard deviation which shrink the range of the feature values as shown in -# the left figure below. +# the left figure below. 
Note in particular that because the outliers on each +# features have different magnitudes, the spread of the transformed data on +# each feature is very different: most of the data lie in the [-2, 4] range for +# the transformed median income feature while the same data is squeezed in the +# smaller [-0.2, 0.2] range for the transformed number of households. +# +# ``StandardScaler`` therefore cannot guarantee balanced feature scales in the +# presence of outliers. make_plot(1) -############################################################################## +########################################################################## # MinMaxScaler # ------------ # # ``MinMaxScaler`` rescales the data set such that all feature values are in # the range [0, 1] as shown in the right figure below. However, this scaling -# compress all inliers in the narrow range [0, 0.005]. +# compress all inliers in the narrow range [0, 0.005] for the transformed +# number of households. +# +# As ``StandardScaler``, ``MinMaxScaler`` is very sensitive to the presence of +# outliers. make_plot(2) -############################################################################### +############################################################################# # MaxAbsScaler # ------------ # # ``MaxAbsScaler`` differs from the previous scaler such that the absolute -# values are mapped in the range [0, 1]. Therefore, in the current example, -# there is no observable difference since the feature values are originally -# positive. +# values are mapped in the range [0, 1]. On positive only data this scalers +# behave similarly to ``MinMaxScaler`` and therefore also suffers from the +# presence of large outliers. make_plot(3) -####################################################################### +############################################################################## # RobustScaler # ------------ # @@ -265,14 +283,21 @@ def make_plot(item_idx): make_plot(4) -############################################################################## -# QuantileTransformer (Uniform output) -# ----------------------------------- +################################################################### +# QuantileTransformer (uniform output) +# ------------------------------------ # # ``QuantileTransformer`` applies a non-linear transformation such that the # probability density function of each feature will be mapped to a uniform # distribution. In this case, all the data will be mapped in the range [0, 1], # even the outliers which cannot be distinguished anymore from the inliers. +# +# As ``RobustScaler``, ``QuantileTransformer`` is robust to outliers in the +# sense that adding or removing outliers in the training set will yield +# approximately the same transformation on held out data. But contrary to +# ``RobustScaler``, ``QuantileTransformer`` will also automatically collapse +# any outlier by setting them to the a priori defined range boundaries (0 and +# 1). 
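The collapse of held-out values onto the [0, 1] boundaries can be seen on a small synthetic example (a sketch, not part of this script)::

    import numpy as np
    from sklearn.preprocessing import QuantileTransformer

    rng = np.random.RandomState(0)
    X_train = rng.normal(size=(1000, 1))
    qt = QuantileTransformer(n_quantiles=100, random_state=0).fit(X_train)
    print(qt.transform(np.array([[-100.], [0.], [100.]])).ravel())
    # roughly [0., 0.5, 1.]: values far outside the training range are mapped
    # onto the boundaries of the uniform output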
make_plot(5) From cb04d53d06e8cfec952db4e2ade9e1901a9013ca Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 20 Apr 2017 11:17:06 +0200 Subject: [PATCH 090/106] Fix broken link and remove useless sentence --- doc/modules/preprocessing.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 89dbfc0832f41..0477471c6ab41 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -14,8 +14,7 @@ In general, learning algorithm benefit from standardization of the data set. If some outliers are present in the set, robust scalers or transformers are more appropriate. The behaviors of the different scalers, transformers, and normalizers on a dataset containing marginal outliers is highlighted in -ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. In the -following, a description of these methods is given. +:ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. .. _preprocessing_scaler: From eac7071065d0b11feccde2a95bb28a45e213921e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Apr 2017 13:22:10 +0200 Subject: [PATCH 091/106] FIX fix couples of spelling --- examples/preprocessing/plot_all_scaling.py | 31 +++++++++++----------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 197ae92702b71..42a92a61dcb89 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -17,12 +17,12 @@ Indeed many estimators are designed with the assumption that each feature takes values close to zero or more importantly that all features vary on comparable -scales. In particular metric-based and gradient-based estimators often assume +scales. In particular, metric-based and gradient-based estimators often assume approximately standardized data (centered features with unit variances). A notable exception are decision tree-based estimators that are robust to arbitrary scaling of the data. -This example uses different scalers, transformers and normalizers to bring the +This example uses different scalers, transformers, and normalizers to bring the data within a pre-defined range. Scalers are linear (or more exactly affine) transformers and differ from each @@ -96,7 +96,8 @@ Normalizer().fit_transform(X)), )) -y = minmax_scale(y_full) # To make colors corresponding to the target), +# scale the output between 0 and 1 for the colorbar +y = minmax_scale(y_full) def create_axes(figsize=(16, 6)): @@ -214,10 +215,10 @@ def make_plot(item_idx): # ------------- # # The following plot displays the original data distribution in the left panel -# and the zoomed in version in the right panel. A large majority of the samples +# and the zoomed-in version in the right panel. A large majority of the samples # are compacted to a specific range, [0, 10] for the median income and [0, 6] # for the number of households. Note that there are some marginal outliers -# (some blocks have more than 1200 households). Therefore a specific +# (some blocks have more than 1200 households). Therefore, a specific # pre-processing can be very beneficial depending of the application. In the # following, we present some insights and behaviors of those pre-processing # methods in the presence of marginal outliers. 
@@ -247,7 +248,7 @@ def make_plot(item_idx): # ------------ # # ``MinMaxScaler`` rescales the data set such that all feature values are in -# the range [0, 1] as shown in the right figure below. However, this scaling +# the range [0, 1] as shown in the right panel below. However, this scaling # compress all inliers in the narrow range [0, 0.005] for the transformed # number of households. # @@ -261,8 +262,8 @@ def make_plot(item_idx): # ------------ # # ``MaxAbsScaler`` differs from the previous scaler such that the absolute -# values are mapped in the range [0, 1]. On positive only data this scalers -# behave similarly to ``MinMaxScaler`` and therefore also suffers from the +# values are mapped in the range [0, 1]. On positive only data, this scaler +# behaves similarly to ``MinMaxScaler`` and therefore also suffers from the # presence of large outliers. make_plot(3) @@ -274,8 +275,8 @@ def make_plot(item_idx): # Unlike the previous scalers, the centering and scaling statistics of this # scaler are based on percentiles and are therefore not influenced by a few # number of very large marginal outliers. Consequently, the resulting range of -# the transformed feature values is larger than for the previous scalers and -# more importantly are approximately similar: for both features most of the +# the transformed feature values is larger than for the previous scalers and, +# more importantly, are approximately similar: for both features most of the # transformed values lie in a [-2, 3] range as seen in the zoomed-in figure. # Note that the outliers themselves are still present in the transformed data. # If trimming the outliers is desirable, a non-linear transformation is @@ -306,7 +307,7 @@ def make_plot(item_idx): # ------------------------------------- # # ``QuantileTransformer`` has an additional ``output_distribution`` parameter -# allowing to match a Gaussian distribution instead of a normal distribution. +# allowing to match a Gaussian distribution instead of a uniform distribution. # Note that this non-parametetric transformer introduces saturation artifacts # for extreme values. @@ -318,10 +319,10 @@ def make_plot(item_idx): # # The ``Normalizer`` rescales the vector for each sample to have unit norm, # independently of the distribution of the samples. It can be seen on both -# figures below where all samples are mapped to the unit circle. In our example -# the two selected features have only positive values therefore the transformed -# only lie in the positive quadrant. This would not be the case if some -# original features had a mix of positive and negative values. +# figures below where all samples are mapped onto the unit circle. In our +# example the two selected features have only positive values; therefore the +# transformed data only lie in the positive quadrant. This would not be the +# case if some original features had a mix of positive and negative values. 
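The remark about the positive quadrant can be checked directly with a few mixed-sign samples (a standalone toy sketch)::

    import numpy as np
    from sklearn.preprocessing import Normalizer

    X_mixed = np.array([[3., 4.],
                        [-3., 4.],
                        [-3., -4.]])
    print(Normalizer().fit_transform(X_mixed))
    # every row still has unit norm, but the rows now fall in three different
    # quadrants of the unit circle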
make_plot(7) plt.show() From 37afa44d0991aaed379dbfd92798246ca1845749 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 23 Apr 2017 13:21:51 +0200 Subject: [PATCH 092/106] FIX comments joel --- doc/modules/preprocessing.rst | 31 +++++++++++++++++----- doc/whats_new.rst | 13 ++++----- examples/preprocessing/plot_all_scaling.py | 19 ++++++------- 3 files changed, 42 insertions(+), 21 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 0477471c6ab41..8f36d29d30ca8 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -10,7 +10,7 @@ The ``sklearn.preprocessing`` package provides several common utility functions and transformer classes to change raw feature vectors into a representation that is more suitable for the downstream estimators. -In general, learning algorithm benefit from standardization of the data set. If +In general, learning algorithms benefit from standardization of the data set. If some outliers are present in the set, robust scalers or transformers are more appropriate. The behaviors of the different scalers, transformers, and normalizers on a dataset containing marginal outliers is highlighted in @@ -268,22 +268,38 @@ methods. It does, however, distort correlations and distances within and across features. :class:`QuantileTransformer` and :func:`quantile_transform` provide a -non-parametric transformation based the quantile function to map the data to a -uniform distribution with values between 0 and 1:: +non-parametric transformation based on the quantile function to map the data to +a uniform distribution with values between 0 and 1:: >>> from sklearn.datasets import load_iris >>> from sklearn.model_selection import train_test_split >>> iris = load_iris() >>> X, y = iris.data, iris.target >>> X_train, X_test, y_train, y_test = train_test_split(X, y) - >>> quantile_transformer = preprocessing.QuantileTransformer() + >>> quantile_transformer = preprocessing.QuantileTransformer( + ... smoothing_noise=1e-12) >>> X_train_trans = quantile_transformer.fit_transform(X_train) >>> X_test_trans = quantile_transformer.transform(X_test) + >>> np.percentile(X_train[:, 0], [0, 25, 50, 75, 100]) + array([ 4.3, 5.2, 5.8, 6.5, 7.9]) + >>> np.percentile(X_train_trans[:, 0], [0, 25, 50, 75, 100]) + ... # doctest: +ELLIPSIS + array([...]) + >>> np.percentile(X_test[:, 0], [0, 25, 50, 75, 100]) + array([ 4.4 , 5. , 5.65, 6.2 , 7.7 ]) + >>> np.percentile(X_test_trans[:, 0], [0, 25, 50, 75, 100]) + ... # doctest: +ELLIPSIS + array([...]) + It is also possible to map the transformed data to a normal distribution by setting ``output_distribution='normal'``:: - >>> X_trans = preprocessing.quantile_transform(X, output_distribution='normal') + >>> quantile_transformer = preprocessing.QuantileTransformer( + ... smoothing_noise=1e-12, output_distribution='normal') + >>> X_trans = quantile_transformer.fit_transform(X) + >>> quantile_transformer.quantiles_ # doctest: + ELLIPSIS + array([...]) Thus the median of the input becomes the mean of the output, centered at 0. The normal output is clipped so that the input's minimum and maximum --- @@ -291,7 +307,10 @@ corresponding to the 1e-7 and 1 - 1e-7 quantiles respectively --- do not become infinite under the transformation. :class:`QuantileTransformer` provides a ``smoothing_noise`` parameter to make -the interpretation more intuitive when manually checking the transformation. 
See +the interpretation more intuitive when manually checking the transformation +which particularly useful useful when feature values are replicated +(e.g. prices, units of time, etc.) rather than truly discrete or truly +continuous. See :ref:`sphx_glr_auto_examples_preprocessing_plot_smoothing_noise_quantile_transform.py` .. _preprocessing_normalization: diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 0ada780eb1b02..16d7068726970 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -56,12 +56,13 @@ New features multinomial logistic loss, and behaves marginally better than 'sag' during the first epochs of ridge and logistic regression. By `Arthur Mensch`_. - - Added :class:`preprocessing.QuantileNormalizer` class for features - - Added :class:`preprocessing.QuantileTransformer` class for features + + - Added :class:`preprocessing.QuantileTransformer` class and + :func:`preprocessing.quantile_transform` function for features normalization based on quantiles. :issue:`8363` by :user:`Denis Engemann `, - :user:`Guillaume Lemaitre `, `Olivier Grisel`_, - `Raghav RV`_, and :user:`Thierry Guillemot `. + :user:`Guillaume Lemaitre `, `Olivier Grisel`_, `Raghav RV`_, + and :user:`Thierry Guillemot `. Enhancements ............ @@ -167,7 +168,7 @@ Enhancements - Add ``sample_weight`` parameter to :func:`metrics.cohen_kappa_score` by Victor Poughon. - - In :class:`gaussian_process.GaussianProcessRegressor`, method ``predict`` + - In :class:`gaussian_process.GaussianProcessRegressor`, method ``predict`` is a lot faster with ``return_std=True`` by :user:`Hadrien Bertrand `. Bug fixes @@ -260,7 +261,7 @@ Bug fixes multiple inheritance context. :issue:`8316` by :user:`Holger Peters `. - - Fix :func:`sklearn.linear_model.BayesianRidge.fit` to return + - Fix :func:`sklearn.linear_model.BayesianRidge.fit` to return ridge parameter `alpha_` and `lambda_` consistent with calculated coefficients `coef_` and `intercept_`. :issue:`8224` by :user:`Peter Gedeck `. diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 42a92a61dcb89..1a295ade09aff 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -25,7 +25,7 @@ This example uses different scalers, transformers, and normalizers to bring the data within a pre-defined range. -Scalers are linear (or more exactly affine) transformers and differ from each +Scalers are linear (or more precisely affine) transformers and differ from each other in the way to estimate the parameters used to shift and scale each feature. @@ -214,14 +214,15 @@ def make_plot(item_idx): # Original data # ------------- # -# The following plot displays the original data distribution in the left panel -# and the zoomed-in version in the right panel. A large majority of the samples -# are compacted to a specific range, [0, 10] for the median income and [0, 6] -# for the number of households. Note that there are some marginal outliers -# (some blocks have more than 1200 households). Therefore, a specific -# pre-processing can be very beneficial depending of the application. In the -# following, we present some insights and behaviors of those pre-processing -# methods in the presence of marginal outliers. +# Each transformation is plotted showing two transformed features, with the +# left plot showing the entire dataset, and the right zoomed-in to show the +# dataset without the marginal outliers. 
A large majority of the samples are +# compacted to a specific range, [0, 10] for the median income and [0, 6] for +# the number of households. Note that there are some marginal outliers (some +# blocks have more than 1200 households). Therefore, a specific pre-processing +# can be very beneficial depending of the application. In the following, we +# present some insights and behaviors of those pre-processing methods in the +# presence of marginal outliers. make_plot(0) From a4719b4c77b94709556730d782befa6e721f401b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 23 Apr 2017 14:04:27 +0200 Subject: [PATCH 093/106] FIX/DOC address documentation comments --- doc/modules/preprocessing.rst | 8 +-- examples/preprocessing/plot_all_scaling.py | 6 +-- ...plot_smoothing_noise_quantile_transform.py | 50 +++++++++---------- 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 8f36d29d30ca8..5362149caeea6 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -282,12 +282,14 @@ a uniform distribution with values between 0 and 1:: >>> X_test_trans = quantile_transformer.transform(X_test) >>> np.percentile(X_train[:, 0], [0, 25, 50, 75, 100]) - array([ 4.3, 5.2, 5.8, 6.5, 7.9]) + ... # doctest: +ELLIPSIS + array([...]) >>> np.percentile(X_train_trans[:, 0], [0, 25, 50, 75, 100]) ... # doctest: +ELLIPSIS array([...]) >>> np.percentile(X_test[:, 0], [0, 25, 50, 75, 100]) - array([ 4.4 , 5. , 5.65, 6.2 , 7.7 ]) + ... # doctest: +ELLIPSIS + array([...]) >>> np.percentile(X_test_trans[:, 0], [0, 25, 50, 75, 100]) ... # doctest: +ELLIPSIS array([...]) @@ -298,7 +300,7 @@ setting ``output_distribution='normal'``:: >>> quantile_transformer = preprocessing.QuantileTransformer( ... smoothing_noise=1e-12, output_distribution='normal') >>> X_trans = quantile_transformer.fit_transform(X) - >>> quantile_transformer.quantiles_ # doctest: + ELLIPSIS + >>> quantile_transformer.quantiles_ # doctest: +ELLIPSIS array([...]) Thus the median of the input becomes the mean of the output, centered at 0. The diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 1a295ade09aff..7175e27dea2b4 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -230,11 +230,11 @@ def make_plot(item_idx): # StandardScaler # -------------- # -# ``StandardScaler`` removes the mean and scale the data to a unit variance. +# ``StandardScaler`` removes the mean and scales the data to unit variance. # However, the outliers have an influence when computing the empirical mean and # standard deviation which shrink the range of the feature values as shown in # the left figure below. Note in particular that because the outliers on each -# features have different magnitudes, the spread of the transformed data on +# feature have different magnitudes, the spread of the transformed data on # each feature is very different: most of the data lie in the [-2, 4] range for # the transformed median income feature while the same data is squeezed in the # smaller [-0.2, 0.2] range for the transformed number of households. @@ -280,7 +280,7 @@ def make_plot(item_idx): # more importantly, are approximately similar: for both features most of the # transformed values lie in a [-2, 3] range as seen in the zoomed-in figure. # Note that the outliers themselves are still present in the transformed data. 
-# If trimming the outliers is desirable, a non-linear transformation is +# If a separate outlier clipping is desirable, a non-linear transformation is # required (see below). make_plot(4) diff --git a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py index f6d6e7b61ac7b..7db05c6575281 100755 --- a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py +++ b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py @@ -7,14 +7,16 @@ The parameter ``smoothing_noise`` can be used if some specific feature values are repeated exactly many times to the point of being predominant in the -dataset. +dataset which particularly useful useful when feature values are replicated +(e.g. prices, units of time, etc.) rather than truly discrete or truly +continuous. Without smoothing noise, the ``QuantileTransformer`` will map those values to some arbitrary value: the highest quantile value for all the inputs with the same value. While this is usually not an issue when ``QuantileTransformer`` is -used as a preprocessing transformer for a subsequent subsequent supervised -estimator, it can lead to surprising results when manually inspecting the -transformed values (e.g. for visualization or reporting). +used as a preprocessing transformer for a subsequent supervised estimator, it +can lead to surprising results when manually inspecting the transformed values +(e.g. for visualization or reporting). The goal of the smoothing noise is to make it possible to map those repeated values to some middle quantile value to make interpretation more intuitive as @@ -37,8 +39,7 @@ def plot_transform_feat_val(ax, transformer, title): - """Plot the full transformation mapping the feature values as well as - a single feature.""" + """Plot the mapping function as well as a specific feature value.""" ref = np.linspace(0, 1, num=N_QUANTILES) ax.plot(transformer.quantiles_, ref) @@ -66,6 +67,24 @@ def plot_transform_feat_val(ax, transformer, title): # ratings for a restaurant. The scale used is ranging from 1 to 5 and # a large number of customers attributed a grade of 3 to the current # restaurant. +# +# By default, the ``QuantileTransformer`` does not apply any smoothing +# noise. When dealing with a data set with a predominant value, this feature +# value can be affected to several quantiles. When provided to the transformer, +# this feature value will be mapped to the largest quantile. In practice, +# machine learning algorithms will usually not be affected by such +# characteristics. However, manual interpretation might be counter intuitive. +# +# From the below plot, we would expect that a vote corresponding to +# the value 3 would be mapped to the median (e.g., 0.5). However, the +# default behaviour of the 'interp' numpy function will map this +# feature value to the greater quantile as shown by the marker in the +# figure. +# +# A solution is to apply a small smoothing noise before computing the +# quantiles. The parameter ``smoothing_noise`` offers this possibility as +# illustrated above. In this case, the marker is centered at the median as +# expected. X = np.array([1] * 2000 + [2] * 1000 + @@ -86,22 +105,3 @@ def plot_transform_feat_val(ax, transformer, title): plot_transform_feat_val(ax2, qt, 'With smoothing') plt.tight_layout() plt.show() - -############################################################################### -# By default, the ``QuantileTransformer`` does not apply any smoothing -# noise. 
Dealing with dataset with a predominant value, the quantile -# computed for such value will correspond to the largest quantiled. In -# practise, marchine learning algorithms will usually not be affected -# by such characteristics. However, manual interpretation might be -# counter intuitive. -# -# From the above plot, we would expect that a vote corresponding to -# the value 3 would be mapped to the median (e.g., 0.5). However, the -# default behaviour of the 'interp' numpy function will map this -# feature value to the greater quantile as show by the marker in the -# figure. -# -# A solution is to apply a small smoothing noise before the -# computation of the quantiles. The parameter ``smoothing_noise`` offers -# this possibility as illustrated above. -# In this case, the marker is centered at the median as expected. From 07906cc7ba22e7be2a3b9b65505c8a02f0e87993 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 7 May 2017 19:01:18 +0200 Subject: [PATCH 094/106] FIX address comments joel --- sklearn/preprocessing/data.py | 115 +++++++------ sklearn/preprocessing/tests/test_data.py | 210 +++++++++++++++-------- 2 files changed, 200 insertions(+), 125 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index af30c2dc5abbd..4ed3f43a85b08 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -247,7 +247,7 @@ class MinMaxScaler(BaseEstimator, TransformerMixin): See also -------- - minmax_scale: Equivalent function without the object oriented API. + minmax_scale: Equivalent function without the estimator API. Notes ----- @@ -502,7 +502,7 @@ class StandardScaler(BaseEstimator, TransformerMixin): See also -------- - scale: Equivalent function without the object oriented API. + scale: Equivalent function without the estimator API. :class:`sklearn.decomposition.PCA` Further removes the linear correlation across features with 'whiten=True'. @@ -712,7 +712,7 @@ class MaxAbsScaler(BaseEstimator, TransformerMixin): See also -------- - maxabs_scale: Equivalent function without the object oriented API. + maxabs_scale: Equivalent function without the estimator API. Notes ----- @@ -934,7 +934,7 @@ class RobustScaler(BaseEstimator, TransformerMixin): See also -------- - robust_scale: Equivalent function without the object oriented API. + robust_scale: Equivalent function without the estimator API. :class:`sklearn.decomposition.PCA` Further removes the linear correlation across features with @@ -1404,7 +1404,7 @@ class Normalizer(BaseEstimator, TransformerMixin): See also -------- - normalize: Equivalent function without the object oriented API. + normalize: Equivalent function without the estimator API. """ def __init__(self, norm='l2', copy=True): @@ -1515,7 +1515,7 @@ class Binarizer(BaseEstimator, TransformerMixin): See also -------- - binarize: Equivalent function without the object oriented API. + binarize: Equivalent function without the estimator API. """ def __init__(self, threshold=0.0, copy=True): @@ -1968,7 +1968,7 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): correlations between variables measured at the same scale but renders variables measured at different scales more directly comparable. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- @@ -1982,12 +1982,13 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): ignore_implicit_zeros : bool, optional (default=False) Only applies to sparse matrices. 
If True, the sparse entries of the - matrix are discarded to compute the quantile statistics. If false, + matrix are discarded to compute the quantile statistics. If False, these entries are treated as zeros. subsample : int, optional (default=1e5) Maximum number of samples used to estimate the quantiles for - computational efficiency. + computational efficiency. Note that the subsamplong procedure may + differ for value-identical sparse and dense matrices. smoothing_noise : float, optional Perturbs features at training time before computing quantiles by adding @@ -2003,8 +2004,8 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): noise. copy : boolean, optional, (default=True) - Set to False to perform inplace scaling and avoid a copy (if the input - is already a numpy array). + Set to False to perform inplace transformation and avoid a copy (if the + input is already a numpy array). Attributes ---------- @@ -2018,15 +2019,15 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): -------- >>> import numpy as np >>> from sklearn.preprocessing import QuantileTransformer - >>> RNG = np.random.RandomState(0) - >>> X = np.sort(RNG.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) + >>> rng = np.random.RandomState(0) + >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) >>> qt = QuantileTransformer(n_quantiles=10, random_state=0) >>> qt.fit_transform(X) # doctest: +ELLIPSIS array([...]) See also -------- - quantile_transform : Equivalent function without the object oriented API. + quantile_transform : Equivalent function without the estimator API. StandardScaler : perform standardization that is faster, but less robust to outliers. RobustScaler : perform robust standardization that removes the influence @@ -2053,8 +2054,8 @@ def __init__(self, n_quantiles=1000, output_distribution='uniform', self.random_state = random_state self.copy = copy - def _compute_quantile_one_column(self, X_col, references, random_state): - """Private function to compute the quantiles for one features.""" + def _compute_quantiles_one_column(self, X_col, references, random_state): + """Private function to compute the quantiles for one feature.""" if self.smoothing_noise is not None: X_col = X_col + random_state.normal(0, self.smoothing_noise, size=X_col.shape) @@ -2076,7 +2077,7 @@ def _dense_fit(self, X, random_state): n_samples, n_features = X.shape # for compatibility issue with numpy<=1.8.X, references # need to be a list scaled between 0 and 100 - references = list(map(lambda x: x * 100, self.references_)) + references = (self.references_ * 100).tolist() self.quantiles_ = [] for col in X.T: if self.subsample < n_samples: @@ -2085,8 +2086,8 @@ def _dense_fit(self, X, random_state): random_state=random_state) col = col.take(subsample_idx, mode='clip') self.quantiles_.append( - self._compute_quantile_one_column(col, references, - random_state)) + self._compute_quantiles_one_column(col, references, + random_state)) self.quantiles_ = np.transpose(self.quantiles_) def _sparse_fit(self, X, random_state): @@ -2132,8 +2133,8 @@ def _sparse_fit(self, X, random_state): self.quantiles_.append([0] * len(references)) else: self.quantiles_.append( - self._compute_quantile_one_column(column_data, references, - random_state)) + self._compute_quantiles_one_column(column_data, references, + random_state)) self.quantiles_ = np.transpose(self.quantiles_) def fit(self, X, y=None): @@ -2152,9 +2153,6 @@ def fit(self, X, y=None): self : object Returns self """ - X = check_array(X, 
accept_sparse='csc') - rng = check_random_state(self.random_state) - if self.n_quantiles <= 0: raise ValueError("Invalid value for 'n_quantiles': %d. " "The number of quantiles must be at least one." @@ -2171,12 +2169,8 @@ def fit(self, X, y=None): "The noise std. dev. should be greater than " "0." % self.smoothing_noise) - # we only accept positive sparse matrix when ignore_implicit_zeros is - # false - if (not self.ignore_implicit_zeros and - (sparse.issparse(X) and np.any(X.data < 0))): - raise ValueError('QuantileTransformer only accepts non-negative' - ' sparse matrices') + X = self._check_inputs(X) + rng = check_random_state(self.random_state) # Create the quantiles of reference self.references_ = np.linspace(0, 1, self.n_quantiles, @@ -2279,7 +2273,6 @@ def _sparse_transform(self, X, inverse=False): X : sparse matrix CSC, shape (n_samples, n_features) Projected data. """ - for feature_idx in range(X.shape[1]): column_slice = slice(X.indptr[feature_idx], X.indptr[feature_idx + 1]) @@ -2288,31 +2281,35 @@ def _sparse_transform(self, X, inverse=False): return X - def _check_inputs_transform(self, X): - """Private function to check the inputs before transforming""" + def _check_inputs(self, X, accept_sparse_negative=False): + """Check inputs before fit and transform""" X = check_array(X, accept_sparse='csc', copy=self.copy, dtype=[np.float64, np.float32]) # we only accept positive sparse matrix when ignore_implicit_zeros is - # false - if (not self.ignore_implicit_zeros and + # false and that we call fit or transform. + if (not accept_sparse_negative and not self.ignore_implicit_zeros and (sparse.issparse(X) and np.any(X.data < 0))): raise ValueError('QuantileTransformer only accepts non-negative' - ' sparse matrices') - check_is_fitted(self, 'quantiles_') - # check that the dimension of X are adequate with the fitted data - if X.shape[1] != self.quantiles_.shape[1]: - raise ValueError('X does not have the same number of feature than' - ' the previously fitted data. Got {} instead of' - ' {}'.format(X.shape[1], - self.quantiles_.shape[1])) + ' sparse matrices.') + # check the output PDF if self.output_distribution not in ('normal', 'uniform'): - raise ValueError("'output_distribution' has to be either 'norm' or" - " 'uniform'. Got {} instead.".format( + raise ValueError("'output_distribution' has to be either 'normal'" + " or 'uniform'. Got '{}' instead.".format( self.output_distribution)) return X + def _check_is_fitted(self, X): + """Check the inputs before transforming""" + check_is_fitted(self, 'quantiles_') + # check that the dimension of X are adequate with the fitted data + if X.shape[1] != self.quantiles_.shape[1]: + raise ValueError('X does not have the same number of features as' + ' the previously fitted data. Got {} instead of' + ' {}.'.format(X.shape[1], + self.quantiles_.shape[1])) + def transform(self, X): """Feature-wise transformation of the data. @@ -2329,7 +2326,8 @@ def transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. """ - X = self._check_inputs_transform(X) + X = self._check_inputs(X) + self._check_is_fitted(X) if sparse.issparse(X): return self._sparse_transform(X, inverse=False) @@ -2350,7 +2348,8 @@ def inverse_transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. 
""" - X = self._check_inputs_transform(X) + X = self._check_inputs(X, accept_sparse_negative=True) + self._check_is_fitted(X) if sparse.issparse(X): return self._sparse_transform(X, inverse=True) @@ -2380,10 +2379,17 @@ def quantile_transform(X, axis=0, n_quantiles=1000, correlations between variables measured at the same scale but renders variables measured at different scales more directly comparable. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- + X : array-like, sparse matrix + The data to transform. + + axis : int, (default=0) + Axis used to compute the means and standard deviations along. If 0, + transform each feature, otherwise (if 1) transform each sample. + n_quantiles : int, optional (default=1000) Number of quantiles to be computed. It corresponds to the number of landmarks used to discretize the cumulative density function. @@ -2394,12 +2400,13 @@ def quantile_transform(X, axis=0, n_quantiles=1000, ignore_implicit_zeros : bool, optional (default=False) Only applies to sparse matrices. If True, the sparse entries of the - matrix are discarded to compute the quantile statistics. If false, + matrix are discarded to compute the quantile statistics. If False, these entries are treated as zeros. subsample : int, optional (default=1e5) Maximum number of samples used to estimate the quantiles for - computational efficiency. + computational efficiency. Note that the subsamplong procedure may + differ for value-identical sparse and dense matrices. smoothing_noise : float, optional Perturbs features at training time before computing quantiles by adding @@ -2415,8 +2422,8 @@ def quantile_transform(X, axis=0, n_quantiles=1000, noise. copy : boolean, optional, (default=True) - Set to False to perform inplace scaling and avoid a copy (if the input - is already a numpy array). + Set to False to perform inplace transformation and avoid a copy (if the + input is already a numpy array). Attributes ---------- @@ -2430,8 +2437,8 @@ def quantile_transform(X, axis=0, n_quantiles=1000, -------- >>> import numpy as np >>> from sklearn.preprocessing import quantile_transform - >>> RNG = np.random.RandomState(0) - >>> X = np.sort(RNG.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) + >>> rng = np.random.RandomState(0) + >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) >>> quantile_transform(X, n_quantiles=10, random_state=0) ... # doctest: +ELLIPSIS array([...]) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index e1344b1583971..a480775221447 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -24,6 +24,7 @@ from sklearn.utils.testing import assert_less_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex +from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_warns_message @@ -856,66 +857,88 @@ def test_robust_scaler_iris_quantiles(): def test_quantile_transform_iris(): X = iris.data + # uniform output distribution transformer = QuantileTransformer(n_quantiles=30) X_trans = transformer.fit_transform(X) - assert_array_almost_equal(np.min(X_trans, axis=0), 0.) - assert_array_almost_equal(np.max(X_trans, axis=0), 1.) 
X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) + # normal output distribution + transformer = QuantileTransformer(n_quantiles=30, + output_distribution='normal') + X_trans = transformer.fit_transform(X) + X_trans_inv = transformer.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + # make sure it is possible to take the inverse of a sparse matrix + # which contain negative value; this is the case in the iris dataset + X_sparse = sparse.csc_matrix(X) + X_sparse_tran = transformer.fit_transform(X_sparse) + X_sparse_tran_inv = transformer.inverse_transform(X_sparse_tran) + assert_array_almost_equal(X_sparse.A, X_sparse_tran_inv.A) def test_quantile_transform_check_error(): - X = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], - [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], - [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T + X = np.transpose([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], + [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]) X = sparse.csc_matrix(X) - X_neg = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], - [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], - [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T + X_neg = np.transpose([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], + [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]) X_neg = sparse.csc_matrix(X_neg) - assert_raises_regex(ValueError, "Invalid value for 'n_quantiles'", - QuantileTransformer(n_quantiles=0).fit, X_neg) - assert_raises_regex(ValueError, "Invalid value for 'subsample'", - QuantileTransformer(subsample=0).fit, X_neg) - assert_raises_regex(ValueError, "Invalid value for 'smoothing_noise'", - QuantileTransformer(smoothing_noise=0).fit, X_neg) + assert_raises_regex(ValueError, "Invalid value for 'n_quantiles': 0.", + QuantileTransformer(n_quantiles=0).fit, X) + assert_raises_regex(ValueError, "Invalid value for 'subsample': 0.", + QuantileTransformer(subsample=0).fit, X) + assert_raises_regex(ValueError, "Invalid value for 'smoothing_noise': 0.", + QuantileTransformer(smoothing_noise=0).fit, X) transformer = QuantileTransformer(n_quantiles=10) assert_raises_regex(ValueError, "QuantileTransformer only accepts " - "non-negative sparse matrices", transformer.fit, X_neg) + "non-negative sparse matrices.", + transformer.fit, X_neg) transformer.fit(X) assert_raises_regex(ValueError, "QuantileTransformer only accepts " - "non-negative sparse matrices", + "non-negative sparse matrices.", transformer.transform, X_neg) - assert_raises_regex(ValueError, "QuantileTransformer only accepts " - "non-negative sparse matrices", - transformer.inverse_transform, X_neg) - X_bad_feat = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], - [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T + X_bad_feat = np.transpose([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]) assert_raises_regex(ValueError, "X does not have the same number of " - "feature than the previously fitted data.", + "features as the previously fitted data. Got 2" + " instead of 3.", transformer.transform, X_bad_feat) assert_raises_regex(ValueError, "X does not have the same number of " - "feature than the previously fitted data.", + "features as the previously fitted data. Got 2" + " instead of 3.", transformer.inverse_transform, X_bad_feat) + transformer = QuantileTransformer(n_quantiles=10, + output_distribution='rnd') + # check that an error is raised at fit time assert_raises_regex(ValueError, "'output_distribution' has to be either" - " 'norm' or 'uniform'. 
Got rnd instead.", - QuantileTransformer( - n_quantiles=10, - output_distribution='rnd').fit_transform, X) + " 'normal' or 'uniform'. Got 'rnd' instead.", + transformer.fit, X) + # check that an error is raised at transform time + transformer.output_distribution = 'uniform' + transformer.fit(X) + X_tran = transformer.transform(X) + transformer.output_distribution = 'rnd' + assert_raises_regex(ValueError, "'output_distribution' has to be either" + " 'normal' or 'uniform'. Got 'rnd' instead.", + transformer.transform, X) + # check that an error is raised at inverse_transform time assert_raises_regex(ValueError, "'output_distribution' has to be either" - " 'norm' or 'uniform'. Got rnd instead.", - QuantileTransformer(n_quantiles=10, - output_distribution='rnd').fit( - X).inverse_transform, X) + " 'normal' or 'uniform'. Got 'rnd' instead.", + transformer.inverse_transform, X_tran) -def test_quantile_transform_ignore_zeros(): - X = np.array([[0, 0, 0, 0, 0], - [1, 0, 2, 2, 1]]).T +def test_quantile_transform_sparse_ignore_zeros(): + X = np.array([[0, 1], + [0, 0], + [0, 2], + [0, 2], + [0, 1]]) X_sparse = sparse.csc_matrix(X) transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) @@ -925,10 +948,13 @@ def test_quantile_transform_ignore_zeros(): " only with sparse matrix. This parameter has no" " effect.", transformer.fit, X) - X_gt = np.array([[0, 0, 0, 0, 0], - [0, 0, 1, 1, 0]]).T + X_expected = np.array([[0, 0], + [0, 0], + [0, 1], + [0, 1], + [0, 0]]) X_trans = transformer.fit_transform(X_sparse) - assert_almost_equal(X_gt, X_trans.A) + assert_almost_equal(X_expected, X_trans.A) # consider the case where sparse entries are missing values and user-given # zeros are to be considered @@ -937,16 +963,16 @@ def test_quantile_transform_ignore_zeros(): X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8]) X_sparse = sparse.csc_matrix((X_data, (X_row, X_col))) X_trans = transformer.fit_transform(X_sparse) - X_gt = np.array([[0., 0.5], - [0., 0.], - [0., 1.], - [0., 1.], - [0., 0.5], - [0., 0.], - [0., 0.5], - [0., 1.], - [0., 0.]]) - assert_almost_equal(X_gt, X_trans.A) + X_expected = np.array([[0., 0.5], + [0., 0.], + [0., 1.], + [0., 1.], + [0., 0.5], + [0., 0.], + [0., 0.5], + [0., 1.], + [0., 0.]]) + assert_almost_equal(X_expected, X_trans.A) transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) @@ -955,44 +981,62 @@ def test_quantile_transform_ignore_zeros(): X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6]) X_sparse = sparse.csc_matrix((X_data, (X_row, X_col))) X_trans = transformer.fit_transform(X_sparse) - X_gt = np.array([[0, 1], - [0, 0.5], - [0, 0.5], - [0, 0.5], - [0, 1], - [0, 0], - [0, 1]]) - assert_almost_equal(X_gt, X_trans.A) + X_expected = np.array([[0, 1], + [0, 0.5], + [0, 0.5], + [0, 0.5], + [0, 1], + [0, 0], + [0, 1]]) + assert_almost_equal(X_expected, X_trans.A) + assert_almost_equal(X_sparse.A, transformer.inverse_transform(X_trans).A) + + # check in conjunction with subsampling + transformer = QuantileTransformer(ignore_implicit_zeros=True, + n_quantiles=5, + subsample=8, + random_state=0) + X_trans = transformer.fit_transform(X_sparse) + assert_almost_equal(X_expected, X_trans.A) assert_almost_equal(X_sparse.A, transformer.inverse_transform(X_trans).A) def test_quantile_transform_dense_toy(): - X = np.array([[0, 25, 50, 75, 100], - [2, 4, 6, 8, 10], - [2.6, 4.1, 2.3, 9.5, 0.1]]).T + X = np.array([[0, 2, 2.6], + [25, 4, 4.1], + [50, 6, 2.3], + [75, 8, 9.5], + [100, 10, 0.1]]) transformer = QuantileTransformer(n_quantiles=5) 
transformer.fit(X) + # using the a uniform output, each entry of X should be map between 0 and 1 + # and equally spaced X_trans = transformer.fit_transform(X) - X_gt = np.tile(np.linspace(0, 1, num=5), (3, 1)).T - assert_almost_equal(np.sort(X_trans, axis=0), X_gt) + X_expected = np.tile(np.linspace(0, 1, num=5), (3, 1)).T + assert_almost_equal(np.sort(X_trans, axis=0), X_expected) X_test = np.array([ [-1, 1, 0], [101, 11, 10], ]) - expected = np.array([ + X_expected = np.array([ [0, 0, 0], [1, 1, 1], ]) - assert_array_almost_equal(transformer.transform(X_test), expected) + assert_array_almost_equal(transformer.transform(X_test), X_expected) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) def test_quantile_transform_subsampling(): + # Test that subsampling the input yield to a consistent results We check + # that the computed quantiles are almost mapped to a [0, 1] vector where + # values are equally spaced. The infinite norm is checked to be smaller + # than a given threshold. This is repeated 5 times. + # dense support n_samples = 1000000 X = np.sort(np.random.sample((n_samples, 1)), axis=0) @@ -1001,7 +1045,7 @@ def test_quantile_transform_subsampling(): for random_state in range(ROUND): transformer = QuantileTransformer(random_state=random_state, n_quantiles=n_samples, - subsample=n_samples//10) + subsample=n_samples // 10) transformer.fit(X) diff = np.linspace(0, 1, n_samples) - np.ravel(transformer.quantiles_) inf_norm = np.max(np.abs(diff)) @@ -1020,7 +1064,7 @@ def test_quantile_transform_subsampling(): for random_state in range(ROUND): transformer = QuantileTransformer(random_state=random_state, n_quantiles=n_samples, - subsample=n_samples//10) + subsample=n_samples // 10) transformer.fit(X) diff = np.linspace(0, 1, n_samples) - np.ravel(transformer.quantiles_) inf_norm = np.max(np.abs(diff)) @@ -1033,9 +1077,16 @@ def test_quantile_transform_subsampling(): def test_quantile_transform_sparse_toy(): - X = np.array([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], - [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], - [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]).T + X = np.array([[0., 2., 0.], + [25., 4., 0.], + [50., 0., 2.6], + [0., 0., 4.1], + [0., 6., 0.], + [0., 8., 0.], + [75., 0., 2.3], + [0., 10., 0.], + [0., 0., 9.5], + [100., 0., 0.1]]) X = sparse.csc_matrix(X) @@ -1070,6 +1121,8 @@ def test_quantile_transform_axis1(): def test_quantile_transform_bounds(): + # Lower and upper bounds are manually mapped. We checked that in the case + # of a constant feature and binary feature, the bounds are properly mapped. X_dense = np.array([[0, 0], [0, 0], [1, 0]]) @@ -1084,14 +1137,29 @@ def test_quantile_transform_bounds(): # check the consistency of the bounds by learning on 1 matrix # and transforming another - X = np.array([[0, 0, 1], - [1, 0.5, 0]]).T - X1 = np.array([[0, 0, 1], - [0.1, 0.5, 0.1]]).T + X = np.array([[0, 1], + [0, 0.5], + [1, 0]]) + X1 = np.array([[0, 0.1], + [0, 0.5], + [1, 0.1]]) transformer = QuantileTransformer(n_quantiles=3).fit(X) X_trans = transformer.transform(X1) assert_array_almost_equal(X_trans, X1) + # check that values outside of the range learned will be mapped properly. 
+ X = np.random.random((1000, 1)) + transformer = QuantileTransformer() + transformer.fit(X) + assert_equal(transformer.transform(-10), transformer.transform(np.min(X))) + assert_equal(transformer.transform(10), transformer.transform(np.max(X))) + assert_equal(transformer.inverse_transform(-10), + transformer.inverse_transform( + np.min(transformer.references_))) + assert_equal(transformer.inverse_transform(10), + transformer.inverse_transform( + np.max(transformer.references_))) + def test_quantile_transform_add_noise_subsamples(): # toy examples From d4d6bb4b010047126a585c9463ee8436ed0b2224 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 7 May 2017 19:11:26 +0200 Subject: [PATCH 095/106] FIX inline sparse and dense transform --- sklearn/preprocessing/data.py | 94 ++++++++++++++--------------------- 1 file changed, 36 insertions(+), 58 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 4ed3f43a85b08..61948f897f2d2 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2232,55 +2232,6 @@ def _transform_col(self, X_col, quantiles, inverse): return X_col - def _dense_transform(self, X, inverse=False): - """Forward and inverse transform for dense matrices. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - The data used to scale along the features axis. - - inverse : bool, optional (default=False) - If False, apply forward transform. If True, apply - inverse transform. - - Returns - ------- - X : ndarray, shape (n_samples, n_features) - Projected data - """ - for feature_idx in range(X.shape[1]): - X[:, feature_idx] = self._transform_col( - X[:, feature_idx], self.quantiles_[:, feature_idx], inverse) - - return X - - def _sparse_transform(self, X, inverse=False): - """Forward and inverse transform for sparse matrices. - - Parameters - ---------- - X : sparse matrix CSC, shape (n_samples, n_features) - The data used to scale along the features axis. The sparse matrix - needs to be nonnegative. - - inverse : bool, optional (default=False) - If False, apply forward transform. If True, apply - inverse transform. - - Returns - ------- - X : sparse matrix CSC, shape (n_samples, n_features) - Projected data. - """ - for feature_idx in range(X.shape[1]): - column_slice = slice(X.indptr[feature_idx], - X.indptr[feature_idx + 1]) - X.data[column_slice] = self._transform_col( - X.data[column_slice], self.quantiles_[:, feature_idx], inverse) - - return X - def _check_inputs(self, X, accept_sparse_negative=False): """Check inputs before fit and transform""" X = check_array(X, accept_sparse='csc', copy=self.copy, @@ -2308,7 +2259,40 @@ def _check_is_fitted(self, X): raise ValueError('X does not have the same number of features as' ' the previously fitted data. Got {} instead of' ' {}.'.format(X.shape[1], - self.quantiles_.shape[1])) + self.quantiles_.shape[1])) + + def _transform(self, X, inverse=False): + """Forward and inverse transform. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + The data used to scale along the features axis. + + inverse : bool, optional (default=False) + If False, apply forward transform. If True, apply + inverse transform. 
+ + Returns + ------- + X : ndarray, shape (n_samples, n_features) + Projected data + """ + + if sparse.issparse(X): + for feature_idx in range(X.shape[1]): + column_slice = slice(X.indptr[feature_idx], + X.indptr[feature_idx + 1]) + X.data[column_slice] = self._transform_col( + X.data[column_slice], self.quantiles_[:, feature_idx], + inverse) + else: + for feature_idx in range(X.shape[1]): + X[:, feature_idx] = self._transform_col( + X[:, feature_idx], self.quantiles_[:, feature_idx], + inverse) + + return X def transform(self, X): """Feature-wise transformation of the data. @@ -2329,10 +2313,7 @@ def transform(self, X): X = self._check_inputs(X) self._check_is_fitted(X) - if sparse.issparse(X): - return self._sparse_transform(X, inverse=False) - else: - return self._dense_transform(X, inverse=False) + return self._transform(X, inverse=False) def inverse_transform(self, X): """Back-projection to the original space. @@ -2351,10 +2332,7 @@ def inverse_transform(self, X): X = self._check_inputs(X, accept_sparse_negative=True) self._check_is_fitted(X) - if sparse.issparse(X): - return self._sparse_transform(X, inverse=True) - else: - return self._dense_transform(X, inverse=True) + return self._transform(X, inverse=True) def quantile_transform(X, axis=0, n_quantiles=1000, From 0b5be040fe6bee63635fada38780d8fe62de3f84 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 7 May 2017 19:13:37 +0200 Subject: [PATCH 096/106] PEP8 --- sklearn/preprocessing/tests/test_data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index a480775221447..20db5f7c99d66 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -24,7 +24,6 @@ from sklearn.utils.testing import assert_less_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex -from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_warns_message From c740628e4a50410e5d338003d52c0e1d4818e3dd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 7 May 2017 20:01:35 +0200 Subject: [PATCH 097/106] TST/DOC temporary skipping test --- doc/modules/preprocessing.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 5362149caeea6..51a71060d3766 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -282,16 +282,16 @@ a uniform distribution with values between 0 and 1:: >>> X_test_trans = quantile_transformer.transform(X_test) >>> np.percentile(X_train[:, 0], [0, 25, 50, 75, 100]) - ... # doctest: +ELLIPSIS + ... # doctest: +ELLIPSIS, +SKIP array([...]) >>> np.percentile(X_train_trans[:, 0], [0, 25, 50, 75, 100]) - ... # doctest: +ELLIPSIS + ... # doctest: +ELLIPSIS, +SKIP array([...]) >>> np.percentile(X_test[:, 0], [0, 25, 50, 75, 100]) - ... # doctest: +ELLIPSIS + ... # doctest: +ELLIPSIS, +SKIP array([...]) >>> np.percentile(X_test_trans[:, 0], [0, 25, 50, 75, 100]) - ... # doctest: +ELLIPSIS + ... 
# doctest: +ELLIPSIS, +SKIP array([...]) It is also possible to map the transformed data to a normal distribution by From 6c2d7cfc110d4e29ebf4e022f3c5141e496221ec Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 16 May 2017 23:46:04 +0200 Subject: [PATCH 098/106] FIX raise an error if n_quantiles > subsample --- sklearn/preprocessing/data.py | 6 ++++++ sklearn/preprocessing/tests/test_data.py | 16 +++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 61948f897f2d2..3a61dd1d98ebf 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2163,6 +2163,12 @@ def fit(self, X, y=None): "The number of subsamples must be at least one." % self.subsample) + if self.n_quantiles > self.subsample: + raise ValueError("The number of quantiles cannot be greater than" + " the number of samples used. Got {} quantiles" + " and {} samples.".format(self.n_quantiles, + self.subsample)) + if self.smoothing_noise is not None: if self.smoothing_noise <= 0: raise ValueError("Invalid value for 'smoothing_noise': %d. " diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 20db5f7c99d66..a21e0a8f57df1 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -889,6 +889,10 @@ def test_quantile_transform_check_error(): QuantileTransformer(n_quantiles=0).fit, X) assert_raises_regex(ValueError, "Invalid value for 'subsample': 0.", QuantileTransformer(subsample=0).fit, X) + assert_raises_regex(ValueError, "The number of quantiles cannot be" + " greater than the number of samples used. Got" + " 1000 quantiles and 10 samples.", + QuantileTransformer(subsample=10).fit, X) assert_raises_regex(ValueError, "Invalid value for 'smoothing_noise': 0.", QuantileTransformer(smoothing_noise=0).fit, X) @@ -1038,15 +1042,17 @@ def test_quantile_transform_subsampling(): # dense support n_samples = 1000000 + n_quantiles = 1000 X = np.sort(np.random.sample((n_samples, 1)), axis=0) ROUND = 5 inf_norm_arr = [] for random_state in range(ROUND): transformer = QuantileTransformer(random_state=random_state, - n_quantiles=n_samples, + n_quantiles=n_quantiles, subsample=n_samples // 10) transformer.fit(X) - diff = np.linspace(0, 1, n_samples) - np.ravel(transformer.quantiles_) + diff = (np.linspace(0, 1, n_quantiles) - + np.ravel(transformer.quantiles_)) inf_norm = np.max(np.abs(diff)) assert_true(inf_norm < 1e-2) inf_norm_arr.append(inf_norm) @@ -1062,12 +1068,12 @@ def test_quantile_transform_subsampling(): inf_norm_arr = [] for random_state in range(ROUND): transformer = QuantileTransformer(random_state=random_state, - n_quantiles=n_samples, + n_quantiles=n_quantiles, subsample=n_samples // 10) transformer.fit(X) - diff = np.linspace(0, 1, n_samples) - np.ravel(transformer.quantiles_) + diff = (np.linspace(0, 1, n_quantiles) - + np.ravel(transformer.quantiles_)) inf_norm = np.max(np.abs(diff)) - assert_true(inf_norm < 1e-1) inf_norm_arr.append(inf_norm) # each random subsampling yield a unique approximation to the expected From 22708c9fe982684a6a38bafcb11891e6696bc99e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 6 Jun 2017 18:36:11 +0200 Subject: [PATCH 099/106] FIX wording in smoothing_noise example --- doc/modules/preprocessing.rst | 11 ++++++----- .../plot_smoothing_noise_quantile_transform.py | 5 ++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/modules/preprocessing.rst 
b/doc/modules/preprocessing.rst index 51a71060d3766..992a1d14322aa 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -308,12 +308,13 @@ normal output is clipped so that the input's minimum and maximum --- corresponding to the 1e-7 and 1 - 1e-7 quantiles respectively --- do not become infinite under the transformation. -:class:`QuantileTransformer` provides a ``smoothing_noise`` parameter to make -the interpretation more intuitive when manually checking the transformation -which particularly useful useful when feature values are replicated -(e.g. prices, units of time, etc.) rather than truly discrete or truly -continuous. See +:class:`QuantileTransformer` provides a ``smoothing_noise`` parameter to +make the interpretation more intuitive when inspecting the +transformation which is particularly useful when feature values are +replicated exactly many times in the training set (e.g. prices, ordinal +values such as user ratings, coarse-grained units of time, etc.). See :ref:`sphx_glr_auto_examples_preprocessing_plot_smoothing_noise_quantile_transform.py` +for more details. .. _preprocessing_normalization: diff --git a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py index 7db05c6575281..5dae98635588f 100755 --- a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py +++ b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py @@ -7,9 +7,8 @@ The parameter ``smoothing_noise`` can be used if some specific feature values are repeated exactly many times to the point of being predominant in the -dataset which particularly useful useful when feature values are replicated -(e.g. prices, units of time, etc.) rather than truly discrete or truly -continuous. +dataset. This is can typically be observed when the feature encode ordinal +values such as user ratings, prices, coarse-grained units of time, etc. 
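[Editor's illustrative aside — not part of the patch.] The docstring edited above explains that, without smoothing noise, a feature value repeated many times is mapped to its highest quantile. The numpy-only sketch below mimics that mechanism with invented rating counts; it uses only np.percentile and np.interp, the interpolation behaviour the example refers to.

import numpy as np

# 1-5 ratings where the value 3 is heavily over-represented (counts invented).
x = np.repeat([1., 2., 3., 4., 5.], [2000, 1000, 7000, 1000, 1000])

references = np.linspace(0, 1, num=100)          # quantile levels
quantiles = np.percentile(x, references * 100)   # empirical quantiles

# For a run of identical quantile values, np.interp returns the last (largest)
# matching reference, so the predominant rating 3 is mapped to the top of its
# quantile range (around 0.83 here) rather than to the middle of that range.
print(np.interp(3., quantiles, references))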
Without smoothing noise, the ``QuantileTransformer`` will map those values to some arbitrary value: the highest quantile value for all the inputs with the From 4d2fe634de1992d23f343fa11e6f0b956d47dd36 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 7 Jun 2017 11:28:00 +0200 Subject: [PATCH 100/106] EXA Denis comments --- examples/preprocessing/plot_all_scaling.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 7175e27dea2b4..221821c479a87 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -76,7 +76,7 @@ X = X_full[:, [0, 5]] -distributions = OrderedDict(( +distributions = [ ('Unscaled data', X), ('Data after standard scaling', StandardScaler().fit_transform(X)), @@ -93,8 +93,8 @@ QuantileTransformer(output_distribution='normal') .fit_transform(X)), ('Data after sample-wise L2 normalizing', - Normalizer().fit_transform(X)), -)) + Normalizer().fit_transform(X)) +] # scale the output between 0 and 1 for the colorbar y = minmax_scale(y_full) @@ -182,12 +182,13 @@ def plot_distribution(axes, X, y, hist_nbins=50, title="", def make_plot(item_idx): - _, X = list(distributions.items())[item_idx] + title, X = distributions[item_idx] ax_zoom_out, ax_zoom_in, ax_colorbar = create_axes() axarr = (ax_zoom_out, ax_zoom_in) plot_distribution(axarr[0], X, y, hist_nbins=200, x0_label="Median Income", - x1_label="Number of households") + x1_label="Number of households", + title=title) # zoom-in zoom_in_percentile_range = (0, 99) @@ -200,7 +201,8 @@ def make_plot(item_idx): plot_distribution(axarr[1], X[non_outliers_mask], y[non_outliers_mask], hist_nbins=50, x0_label="Median Income", - x1_label="Number of households") + x1_label="Number of households", + title="Zoom-in") norm = mpl.colors.Normalize(y_full.min(), y_full.max()) mpl.colorbar.ColorbarBase(ax_colorbar, cmap=cm.plasma_r, From 49c94b37e5210cc5978bc85379c12e2a0937cd1e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 7 Jun 2017 11:29:56 +0200 Subject: [PATCH 101/106] FIX rephrasing --- doc/modules/preprocessing.rst | 2 +- .../preprocessing/plot_smoothing_noise_quantile_transform.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 992a1d14322aa..a17fc21f3752e 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -311,7 +311,7 @@ become infinite under the transformation. :class:`QuantileTransformer` provides a ``smoothing_noise`` parameter to make the interpretation more intuitive when inspecting the transformation which is particularly useful when feature values are -replicated exactly many times in the training set (e.g. prices, ordinal +replicated identically many times in the training set (e.g. prices, ordinal values such as user ratings, coarse-grained units of time, etc.). See :ref:`sphx_glr_auto_examples_preprocessing_plot_smoothing_noise_quantile_transform.py` for more details. 
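[Editor's illustrative aside — not part of the patch.] A short usage sketch of the ``smoothing_noise`` parameter discussed above, written against the API as it stands at this point in the series (a float standard deviation; a later commit in this series turns it into a boolean). The rating counts are invented, and the parameter is specific to this branch rather than an assumed part of released scikit-learn.

import numpy as np
from sklearn.preprocessing import QuantileTransformer

# A 1-5 ratings feature with a predominant value of 3.
X = np.repeat([1., 2., 3., 4., 5.],
              [2000, 1000, 7000, 1000, 1000]).reshape(-1, 1)

without_noise = QuantileTransformer(n_quantiles=100, random_state=0)
with_noise = QuantileTransformer(n_quantiles=100, smoothing_noise=1e-7,
                                 random_state=0)

# Without the noise the repeated value sits at the top of its quantile range;
# with it, the value lands close to the middle, which is easier to interpret.
print(without_noise.fit(X).transform([[3.]]))
print(with_noise.fit(X).transform([[3.]]))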
diff --git a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py index 5dae98635588f..decd4539f0a57 100755 --- a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py +++ b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py @@ -6,7 +6,7 @@ ======================================================== The parameter ``smoothing_noise`` can be used if some specific feature values -are repeated exactly many times to the point of being predominant in the +are repeated identically many times to the point of being predominant in the dataset. This is can typically be observed when the feature encode ordinal values such as user ratings, prices, coarse-grained units of time, etc. From 2c85eb3e5cd528554116b63687a2c24e4ebb7580 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 7 Jun 2017 14:22:20 +0200 Subject: [PATCH 102/106] FIX make smoothing_noise to be a boolearn and change doc --- doc/modules/preprocessing.rst | 4 +-- examples/preprocessing/plot_all_scaling.py | 15 ++++------ ...plot_smoothing_noise_quantile_transform.py | 26 +++++++++-------- sklearn/preprocessing/data.py | 29 +++++++------------ sklearn/preprocessing/tests/test_data.py | 21 +++++++------- 5 files changed, 43 insertions(+), 52 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index a17fc21f3752e..3b8b0628324e5 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -298,7 +298,7 @@ It is also possible to map the transformed data to a normal distribution by setting ``output_distribution='normal'``:: >>> quantile_transformer = preprocessing.QuantileTransformer( - ... smoothing_noise=1e-12, output_distribution='normal') + ... smoothing_noise=True, output_distribution='normal') >>> X_trans = quantile_transformer.fit_transform(X) >>> quantile_transformer.quantiles_ # doctest: +ELLIPSIS array([...]) @@ -310,7 +310,7 @@ become infinite under the transformation. :class:`QuantileTransformer` provides a ``smoothing_noise`` parameter to make the interpretation more intuitive when inspecting the -transformation which is particularly useful when feature values are +transformation. This is particularly useful when feature values are replicated identically many times in the training set (e.g. prices, ordinal values such as user ratings, coarse-grained units of time, etc.). 
See :ref:`sphx_glr_auto_examples_preprocessing_plot_smoothing_noise_quantile_transform.py` diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 221821c479a87..d420a02afcc16 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -47,8 +47,6 @@ from __future__ import print_function -from collections import OrderedDict - import numpy as np import matplotlib as mpl @@ -100,8 +98,9 @@ y = minmax_scale(y_full) -def create_axes(figsize=(16, 6)): - plt.figure(figsize=figsize) +def create_axes(title, figsize=(16, 6)): + fig = plt.figure(figsize=figsize) + fig.suptitle(title) # define the axis for the first plot left, width = 0.1, 0.22 @@ -183,12 +182,11 @@ def plot_distribution(axes, X, y, hist_nbins=50, title="", def make_plot(item_idx): title, X = distributions[item_idx] - ax_zoom_out, ax_zoom_in, ax_colorbar = create_axes() + ax_zoom_out, ax_zoom_in, ax_colorbar = create_axes(title) axarr = (ax_zoom_out, ax_zoom_in) plot_distribution(axarr[0], X, y, hist_nbins=200, x0_label="Median Income", - x1_label="Number of households", - title=title) + x1_label="Number of households") # zoom-in zoom_in_percentile_range = (0, 99) @@ -201,8 +199,7 @@ def make_plot(item_idx): plot_distribution(axarr[1], X[non_outliers_mask], y[non_outliers_mask], hist_nbins=50, x0_label="Median Income", - x1_label="Number of households", - title="Zoom-in") + x1_label="Number of households") norm = mpl.colors.Normalize(y_full.min(), y_full.max()) mpl.colorbar.ColorbarBase(ax_colorbar, cmap=cm.plasma_r, diff --git a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py index decd4539f0a57..f4ac17ddb795e 100755 --- a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py +++ b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py @@ -7,8 +7,9 @@ The parameter ``smoothing_noise`` can be used if some specific feature values are repeated identically many times to the point of being predominant in the -dataset. This is can typically be observed when the feature encode ordinal -values such as user ratings, prices, coarse-grained units of time, etc. +dataset. This is typically be observed when the feature encode ordinal +values such as user ratings, prices, coarse-grained units of time, etc. By +default, a small Gaussian noise is added during ``fit`` time. Without smoothing noise, the ``QuantileTransformer`` will map those values to some arbitrary value: the highest quantile value for all the inputs with the @@ -17,9 +18,9 @@ can lead to surprising results when manually inspecting the transformed values (e.g. for visualization or reporting). -The goal of the smoothing noise is to make it possible to map those repeated -values to some middle quantile value to make interpretation more intuitive as -demonstrated in the following. +The goal of the ``smoothing_noise`` is to make it possible to map those +repeated values to some middle quantile value to make interpretation more +intuitive as demonstrated in the following. """ @@ -67,8 +68,8 @@ def plot_transform_feat_val(ax, transformer, title): # a large number of customers attributed a grade of 3 to the current # restaurant. # -# By default, the ``QuantileTransformer`` does not apply any smoothing -# noise. When dealing with a data set with a predominant value, this feature +# The ``smoothing_noise`` can be disabled in ``QuantileTransformer``. 
+# When dealing with a data set with a predominant value, this feature # value can be affected to several quantiles. When provided to the transformer, # this feature value will be mapped to the largest quantile. In practice, # machine learning algorithms will usually not be affected by such @@ -81,9 +82,9 @@ def plot_transform_feat_val(ax, transformer, title): # figure. # # A solution is to apply a small smoothing noise before computing the -# quantiles. The parameter ``smoothing_noise`` offers this possibility as -# illustrated above. In this case, the marker is centered at the median as -# expected. +# quantiles. The parameter ``smoothing_noise=True`` (default behaviour) offers +# this possibility as illustrated above. In this case, the marker is centered +# at the median as expected. X = np.array([1] * 2000 + [2] * 1000 + @@ -94,12 +95,13 @@ def plot_transform_feat_val(ax, transformer, title): # create the subplots _, (ax1, ax2) = plt.subplots(1, 2) -qt = QuantileTransformer(n_quantiles=N_QUANTILES) +qt = QuantileTransformer(n_quantiles=N_QUANTILES, + smoothing_noise=False) qt.fit(X) plot_transform_feat_val(ax1, qt, 'Without smoothing') qt = QuantileTransformer(n_quantiles=N_QUANTILES, - smoothing_noise=1e-7) + smoothing_noise=True) qt.fit(X) plot_transform_feat_val(ax2, qt, 'With smoothing') plt.tight_layout() diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 3a61dd1d98ebf..66220d014a671 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1990,11 +1990,10 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): computational efficiency. Note that the subsamplong procedure may differ for value-identical sparse and dense matrices. - smoothing_noise : float, optional + smoothing_noise : bool, optional (default=True) Perturbs features at training time before computing quantiles by adding - Gaussian noise with standard deviation ``smoothing_noise``. It eases - the interpratation of the computed ``quantiles_`` when a particular - feature value is predominant. + Gaussian noise. It eases interpratation of the computed ``quantiles_`` + when a particular feature value is predominant. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -2045,7 +2044,7 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): def __init__(self, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), - smoothing_noise=None, random_state=None, copy=True): + smoothing_noise=True, random_state=None, copy=True): self.n_quantiles = n_quantiles self.output_distribution = output_distribution self.ignore_implicit_zeros = ignore_implicit_zeros @@ -2056,9 +2055,8 @@ def __init__(self, n_quantiles=1000, output_distribution='uniform', def _compute_quantiles_one_column(self, X_col, references, random_state): """Private function to compute the quantiles for one feature.""" - if self.smoothing_noise is not None: - X_col = X_col + random_state.normal(0, self.smoothing_noise, - size=X_col.shape) + if self.smoothing_noise is True: + X_col = X_col + random_state.normal(0, 1e-7, size=X_col.shape) return np.percentile(X_col, references) @@ -2169,12 +2167,6 @@ def fit(self, X, y=None): " and {} samples.".format(self.n_quantiles, self.subsample)) - if self.smoothing_noise is not None: - if self.smoothing_noise <= 0: - raise ValueError("Invalid value for 'smoothing_noise': %d. " - "The noise std. dev. should be greater than " - "0." 
% self.smoothing_noise) - X = self._check_inputs(X) rng = check_random_state(self.random_state) @@ -2345,7 +2337,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), - smoothing_noise=None, + smoothing_noise=True, random_state=None, copy=False): """Transform features using quantiles information. @@ -2392,11 +2384,10 @@ def quantile_transform(X, axis=0, n_quantiles=1000, computational efficiency. Note that the subsamplong procedure may differ for value-identical sparse and dense matrices. - smoothing_noise : float, optional + smoothing_noise : bool, optional (default=True) Perturbs features at training time before computing quantiles by adding - Gaussian noise with standard deviation ``smoothing_noise``. It eases - the interpratation of the computed ``quantiles_`` when a particular - feature value is predominant. + Gaussian noise. It eases interpratation of the computed ``quantiles_`` + when a particular feature value is predominant. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index a21e0a8f57df1..43af3ffdbf42f 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -893,8 +893,6 @@ def test_quantile_transform_check_error(): " greater than the number of samples used. Got" " 1000 quantiles and 10 samples.", QuantileTransformer(subsample=10).fit, X) - assert_raises_regex(ValueError, "Invalid value for 'smoothing_noise': 0.", - QuantileTransformer(smoothing_noise=0).fit, X) transformer = QuantileTransformer(n_quantiles=10) assert_raises_regex(ValueError, "QuantileTransformer only accepts " @@ -944,7 +942,7 @@ def test_quantile_transform_sparse_ignore_zeros(): [0, 1]]) X_sparse = sparse.csc_matrix(X) transformer = QuantileTransformer(ignore_implicit_zeros=True, - n_quantiles=5) + n_quantiles=5, smoothing_noise=False) # dense case -> warning raise assert_warns_message(UserWarning, "'ignore_implicit_zeros' takes effect" @@ -978,7 +976,7 @@ def test_quantile_transform_sparse_ignore_zeros(): assert_almost_equal(X_expected, X_trans.A) transformer = QuantileTransformer(ignore_implicit_zeros=True, - n_quantiles=5) + n_quantiles=5, smoothing_noise=False) X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1]) X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1]) X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6]) @@ -998,7 +996,8 @@ def test_quantile_transform_sparse_ignore_zeros(): transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5, subsample=8, - random_state=0) + random_state=0, + smoothing_noise=False) X_trans = transformer.fit_transform(X_sparse) assert_almost_equal(X_expected, X_trans.A) assert_almost_equal(X_sparse.A, transformer.inverse_transform(X_trans).A) @@ -1134,9 +1133,11 @@ def test_quantile_transform_bounds(): X_sparse = sparse.csc_matrix(X_dense) # check sparse and dense are consistent - X_trans = QuantileTransformer(n_quantiles=3).fit_transform(X_dense) + X_trans = QuantileTransformer( + n_quantiles=3, smoothing_noise=False).fit_transform(X_dense) assert_array_almost_equal(X_trans, X_dense) - X_trans_sp = QuantileTransformer(n_quantiles=3).fit_transform(X_sparse) + X_trans_sp = QuantileTransformer( + n_quantiles=3, smoothing_noise=False).fit_transform(X_sparse) assert_array_almost_equal(X_trans_sp.A, X_dense) assert_array_almost_equal(X_trans, X_trans_sp.A) 
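[Editor's illustrative aside — not part of the patch.] The commit above routes the per-feature subsampling through ``random_state.choice`` before the quantiles are computed. The numpy-only sketch below reproduces that idea on a single uniform feature and compares the subsampled quantile estimate with the full-data quantiles, mirroring the tolerance used in the accompanying test.

import numpy as np

rng = np.random.RandomState(0)
col = rng.random_sample(1000000)                 # one uniform feature
n_quantiles, subsample = 1000, 100000
references = np.linspace(0, 1, n_quantiles) * 100

# Draw the row subset without replacement, as the fit code now does via
# random_state.choice, then estimate the quantiles on that subset only.
idx = rng.choice(col.shape[0], size=subsample, replace=False)
quantiles_sub = np.percentile(col.take(idx), references)
quantiles_full = np.percentile(col, references)

# As in the accompanying test, the subsampled estimate typically stays within
# roughly 1e-2 (infinity norm) of the full-data quantiles for uniform data.
print(np.max(np.abs(quantiles_sub - quantiles_full)))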
@@ -1172,7 +1173,7 @@ def test_quantile_transform_add_noise_subsamples(): X = np.transpose([[unique_feature[0]] * 1 + [unique_feature[1]] * 7 + [unique_feature[2]] * 2]) - transformer = QuantileTransformer(n_quantiles=100, smoothing_noise=1e-7, + transformer = QuantileTransformer(n_quantiles=100, smoothing_noise=True, random_state=0) transformer.fit(X) # check that the feature values associated to quantiles are strictly @@ -1181,7 +1182,7 @@ def test_quantile_transform_add_noise_subsamples(): map(assert_greater, diff_quantiles, [0] * len(diff_quantiles)) # iris dataset X = iris.data - transformer = QuantileTransformer(n_quantiles=1000, smoothing_noise=1e-7, + transformer = QuantileTransformer(n_quantiles=1000, smoothing_noise=True, random_state=0) X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) @@ -1208,7 +1209,7 @@ def test_quantile_transform_numpy_interp_behaviour(): X = np.transpose([[unique_feature[0]] * 1 + [unique_feature[1]] * 7 + [unique_feature[2]] * 2]) - qt = QuantileTransformer(n_quantiles=100) + qt = QuantileTransformer(n_quantiles=100, smoothing_noise=False) qt.fit(X) ref = np.linspace(0., 1., num=qt.n_quantiles) max_quantiles_idx = [np.flatnonzero(qt.quantiles_ == unique_feature[i])[-1] From db08c55bfe85d35588b23bb210232cf427d282c1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 7 Jun 2017 18:59:22 +0200 Subject: [PATCH 103/106] FIX address comments --- doc/modules/preprocessing.rst | 40 +++++++++++----------- examples/preprocessing/plot_all_scaling.py | 6 ++-- sklearn/preprocessing/data.py | 15 ++++---- sklearn/preprocessing/tests/test_data.py | 28 +++++++++------ 4 files changed, 48 insertions(+), 41 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 3b8b0628324e5..a58a61a649a28 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -275,44 +275,44 @@ a uniform distribution with values between 0 and 1:: >>> from sklearn.model_selection import train_test_split >>> iris = load_iris() >>> X, y = iris.data, iris.target - >>> X_train, X_test, y_train, y_test = train_test_split(X, y) - >>> quantile_transformer = preprocessing.QuantileTransformer( - ... smoothing_noise=1e-12) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + >>> quantile_transformer = preprocessing.QuantileTransformer(random_state=0) >>> X_train_trans = quantile_transformer.fit_transform(X_train) >>> X_test_trans = quantile_transformer.transform(X_test) - >>> np.percentile(X_train[:, 0], [0, 25, 50, 75, 100]) - ... # doctest: +ELLIPSIS, +SKIP - array([...]) + array([ 4.3, 5.1, 5.8, 6.5, 7.9]) >>> np.percentile(X_train_trans[:, 0], [0, 25, 50, 75, 100]) - ... # doctest: +ELLIPSIS, +SKIP - array([...]) + array([ 0.0000001 , 0.24608042, 0.49100792, 0.73162701, 0.9999999 ]) >>> np.percentile(X_test[:, 0], [0, 25, 50, 75, 100]) - ... # doctest: +ELLIPSIS, +SKIP - array([...]) + array([ 4.4 , 5.125, 5.75 , 6.175, 7.3 ]) >>> np.percentile(X_test_trans[:, 0], [0, 25, 50, 75, 100]) - ... # doctest: +ELLIPSIS, +SKIP - array([...]) + array([ 0.01801802, 0.25653626, 0.46157383, 0.6081081 , 0.94144144]) It is also possible to map the transformed data to a normal distribution by setting ``output_distribution='normal'``:: >>> quantile_transformer = preprocessing.QuantileTransformer( - ... smoothing_noise=True, output_distribution='normal') + ... 
output_distribution='normal', random_state=0) >>> X_trans = quantile_transformer.fit_transform(X) - >>> quantile_transformer.quantiles_ # doctest: +ELLIPSIS - array([...]) + >>> quantile_transformer.quantiles_ # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + array([[ 4.30000001, 2.00000009, 1.00000003, 0.09999991], + [ 4.3149149 , 2.02982991, 1.01491493, 0.09999992], + [ 4.32982979, 2.05965973, 1.02982983, 0.09999992], + ..., + [ 7.8403404 , 4.34034033, 6.84034045, 2.50000003], + [ 7.87017023, 4.37017021, 6.8701703 , 2.50000003], + [ 7.90000005, 4.40000008, 6.90000015, 2.50000004]]) Thus the median of the input becomes the mean of the output, centered at 0. The normal output is clipped so that the input's minimum and maximum --- corresponding to the 1e-7 and 1 - 1e-7 quantiles respectively --- do not become infinite under the transformation. -:class:`QuantileTransformer` provides a ``smoothing_noise`` parameter to -make the interpretation more intuitive when inspecting the -transformation. This is particularly useful when feature values are -replicated identically many times in the training set (e.g. prices, ordinal -values such as user ratings, coarse-grained units of time, etc.). See +:class:`QuantileTransformer` provides a ``smoothing_noise`` parameter (set to +True by default) to make the interpretation more intuitive when inspecting the +transformation. This is particularly useful when feature values are replicated +identically many times in the training set (e.g. prices, ordinal values such as +user ratings, coarse-grained units of time, etc.). See :ref:`sphx_glr_auto_examples_preprocessing_plot_smoothing_noise_quantile_transform.py` for more details. diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index d420a02afcc16..677386a00191c 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -186,7 +186,8 @@ def make_plot(item_idx): axarr = (ax_zoom_out, ax_zoom_in) plot_distribution(axarr[0], X, y, hist_nbins=200, x0_label="Median Income", - x1_label="Number of households") + x1_label="Number of households", + title="Full data") # zoom-in zoom_in_percentile_range = (0, 99) @@ -199,7 +200,8 @@ def make_plot(item_idx): plot_distribution(axarr[1], X[non_outliers_mask], y[non_outliers_mask], hist_nbins=50, x0_label="Median Income", - x1_label="Number of households") + x1_label="Number of households", + title="Zoom-in") norm = mpl.colors.Normalize(y_full.min(), y_full.max()) mpl.colorbar.ColorbarBase(ax_colorbar, cmap=cm.plasma_r, diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index aee22ac1fed30..46d19dc580533 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1986,7 +1986,7 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): subsample : int, optional (default=1e5) Maximum number of samples used to estimate the quantiles for - computational efficiency. Note that the subsamplong procedure may + computational efficiency. Note that the subsampling procedure may differ for value-identical sparse and dense matrices. 
smoothing_noise : bool, optional (default=True) @@ -2078,9 +2078,9 @@ def _dense_fit(self, X, random_state): self.quantiles_ = [] for col in X.T: if self.subsample < n_samples: - subsample_idx = choice(n_samples, size=self.subsample, - replace=False, - random_state=random_state) + subsample_idx = random_state.choice(n_samples, + size=self.subsample, + replace=False) col = col.take(subsample_idx, mode='clip') self.quantiles_.append( self._compute_quantiles_one_column(col, references, @@ -2113,9 +2113,8 @@ def _sparse_fit(self, X, random_state): dtype=X.dtype) else: column_data = np.zeros(shape=self.subsample, dtype=X.dtype) - column_data[:column_subsample] = choice( - column_nnz_data, size=column_subsample, - replace=False, random_state=random_state) + column_data[:column_subsample] = random_state.choice( + column_nnz_data, size=column_subsample, replace=False) else: if self.ignore_implicit_zeros: column_data = np.zeros(shape=len(column_nnz_data), @@ -2380,7 +2379,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, subsample : int, optional (default=1e5) Maximum number of samples used to estimate the quantiles for - computational efficiency. Note that the subsamplong procedure may + computational efficiency. Note that the subsampling procedure may differ for value-identical sparse and dense matrices. smoothing_noise : bool, optional (default=True) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 43af3ffdbf42f..5f2b5d7bc70cf 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -857,13 +857,14 @@ def test_robust_scaler_iris_quantiles(): def test_quantile_transform_iris(): X = iris.data # uniform output distribution - transformer = QuantileTransformer(n_quantiles=30) + transformer = QuantileTransformer(n_quantiles=30, smoothing_noise=False) X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # normal output distribution transformer = QuantileTransformer(n_quantiles=30, - output_distribution='normal') + output_distribution='normal', + smoothing_noise=False) X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) @@ -1010,7 +1011,7 @@ def test_quantile_transform_dense_toy(): [75, 8, 9.5], [100, 10, 0.1]]) - transformer = QuantileTransformer(n_quantiles=5) + transformer = QuantileTransformer(n_quantiles=5, smoothing_noise=False) transformer.fit(X) # using the a uniform output, each entry of X should be map between 0 and 1 @@ -1094,7 +1095,7 @@ def test_quantile_transform_sparse_toy(): X = sparse.csc_matrix(X) - transformer = QuantileTransformer(n_quantiles=10) + transformer = QuantileTransformer(n_quantiles=10, smoothing_noise=False) transformer.fit(X) X_trans = transformer.fit_transform(X) @@ -1104,7 +1105,9 @@ def test_quantile_transform_sparse_toy(): X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) - transformer_dense = QuantileTransformer(n_quantiles=10).fit(X.toarray()) + transformer_dense = QuantileTransformer(n_quantiles=10, + smoothing_noise=False).fit( + X.toarray()) X_trans = transformer_dense.transform(X) assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.) 
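The ``data.py`` hunks above switch the per-column subsampling from a standalone ``choice`` helper to the ``random_state.choice`` method, so every draw goes through the single seeded ``RandomState`` object used by the estimator. A minimal standalone sketch of that fit-time step (the sizes, the lognormal toy column and the ``references`` grid are made up for illustration and are not taken from scikit-learn)::

    import numpy as np

    rng = np.random.RandomState(0)                  # seeded generator, as in the patch
    n_samples, subsample, n_quantiles = 10000, 1000, 5

    col = rng.lognormal(size=n_samples)             # one toy feature column
    references = np.linspace(0, 100, num=n_quantiles)   # 0, 25, 50, 75, 100 (%)

    # draw a reproducible subsample of row indices, then estimate the
    # quantile landmarks of the column from that subsample only
    subsample_idx = rng.choice(n_samples, size=subsample, replace=False)
    quantiles = np.percentile(col.take(subsample_idx), references)
    print(quantiles)        # increasing landmarks estimated from the subsample

Routing the draw through ``random_state.choice`` keeps the whole fit reproducible from one seed, which is also why the tests touched by this patch now pass ``random_state=0`` explicitly.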
@@ -1133,11 +1136,13 @@ def test_quantile_transform_bounds(): X_sparse = sparse.csc_matrix(X_dense) # check sparse and dense are consistent - X_trans = QuantileTransformer( - n_quantiles=3, smoothing_noise=False).fit_transform(X_dense) + X_trans = QuantileTransformer(n_quantiles=3, + smoothing_noise=False, + random_state=0).fit_transform(X_dense) assert_array_almost_equal(X_trans, X_dense) - X_trans_sp = QuantileTransformer( - n_quantiles=3, smoothing_noise=False).fit_transform(X_sparse) + X_trans_sp = QuantileTransformer(n_quantiles=3, + smoothing_noise=False, + random_state=0).fit_transform(X_sparse) assert_array_almost_equal(X_trans_sp.A, X_dense) assert_array_almost_equal(X_trans, X_trans_sp.A) @@ -1149,13 +1154,14 @@ def test_quantile_transform_bounds(): X1 = np.array([[0, 0.1], [0, 0.5], [1, 0.1]]) - transformer = QuantileTransformer(n_quantiles=3).fit(X) + transformer = QuantileTransformer(n_quantiles=3, + smoothing_noise=False).fit(X) X_trans = transformer.transform(X1) assert_array_almost_equal(X_trans, X1) # check that values outside of the range learned will be mapped properly. X = np.random.random((1000, 1)) - transformer = QuantileTransformer() + transformer = QuantileTransformer(smoothing_noise=False) transformer.fit(X) assert_equal(transformer.transform(-10), transformer.transform(np.min(X))) assert_equal(transformer.transform(10), transformer.transform(np.max(X))) From be207c7b62a070a3aa82f34d6fd708482b2fd317 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 8 Jun 2017 01:07:00 +0200 Subject: [PATCH 104/106] FIX verbose the doc slightly more --- doc/modules/preprocessing.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index a58a61a649a28..b3b0659665270 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -281,8 +281,16 @@ a uniform distribution with values between 0 and 1:: >>> X_test_trans = quantile_transformer.transform(X_test) >>> np.percentile(X_train[:, 0], [0, 25, 50, 75, 100]) array([ 4.3, 5.1, 5.8, 6.5, 7.9]) + +This feature corresponds to the sepal length in cm. Once the quantile +transformation applied, those landmarks approach closely the percentiles +previously defined:: + >>> np.percentile(X_train_trans[:, 0], [0, 25, 50, 75, 100]) array([ 0.0000001 , 0.24608042, 0.49100792, 0.73162701, 0.9999999 ]) + +This can be confirmed on a independent testing set with similar remarks:: + >>> np.percentile(X_test[:, 0], [0, 25, 50, 75, 100]) array([ 4.4 , 5.125, 5.75 , 6.175, 7.3 ]) >>> np.percentile(X_test_trans[:, 0], [0, 25, 50, 75, 100]) From 7b17f14d2ad64201738ad1c270bfc7388f2ba5b0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 8 Jun 2017 18:14:18 +0200 Subject: [PATCH 105/106] PEP8/DOC --- doc/modules/preprocessing.rst | 19 +++++++++++-------- sklearn/preprocessing/data.py | 2 -- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index b3b0659665270..fd7089faf346d 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -287,14 +287,17 @@ transformation applied, those landmarks approach closely the percentiles previously defined:: >>> np.percentile(X_train_trans[:, 0], [0, 25, 50, 75, 100]) - array([ 0.0000001 , 0.24608042, 0.49100792, 0.73162701, 0.9999999 ]) + ... # doctest: +ELLIPSIS +SKIP + array([ 0.00... , 0.24..., 0.49..., 0.73..., 0.99... 
]) This can be confirmed on a independent testing set with similar remarks:: >>> np.percentile(X_test[:, 0], [0, 25, 50, 75, 100]) + ... # doctest: +SKIP array([ 4.4 , 5.125, 5.75 , 6.175, 7.3 ]) >>> np.percentile(X_test_trans[:, 0], [0, 25, 50, 75, 100]) - array([ 0.01801802, 0.25653626, 0.46157383, 0.6081081 , 0.94144144]) + ... # doctest: +ELLIPSIS +SKIP + array([ 0.01..., 0.25..., 0.46..., 0.60... , 0.94...]) It is also possible to map the transformed data to a normal distribution by setting ``output_distribution='normal'``:: @@ -303,13 +306,13 @@ setting ``output_distribution='normal'``:: ... output_distribution='normal', random_state=0) >>> X_trans = quantile_transformer.fit_transform(X) >>> quantile_transformer.quantiles_ # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - array([[ 4.30000001, 2.00000009, 1.00000003, 0.09999991], - [ 4.3149149 , 2.02982991, 1.01491493, 0.09999992], - [ 4.32982979, 2.05965973, 1.02982983, 0.09999992], + array([[ 4.30..., 2.00..., 1.00..., 0.09...], + [ 4.31... , 2.02..., 1.01..., 0.09...], + [ 4.32..., 2.05..., 1.02..., 0.09...], ..., - [ 7.8403404 , 4.34034033, 6.84034045, 2.50000003], - [ 7.87017023, 4.37017021, 6.8701703 , 2.50000003], - [ 7.90000005, 4.40000008, 6.90000015, 2.50000004]]) + [ 7.84... , 4.34..., 6.84..., 2.50...], + [ 7.87..., 4.37..., 6.87... , 2.50...], + [ 7.90..., 4.40..., 6.90..., 2.50...]]) Thus the median of the input becomes the mean of the output, centered at 0. The normal output is clipped so that the input's minimum and maximum --- diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 46d19dc580533..cd17616da11f4 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -29,8 +29,6 @@ min_max_axis) from ..utils.validation import (check_is_fitted, check_random_state, FLOAT_DTYPES) -from ..utils.random import choice - BOUNDS_THRESHOLD = 1e-7 From 7046a6d2f0eb1893c6d934750861850797cdf002 Mon Sep 17 00:00:00 2001 From: Gael Varoquaux Date: Fri, 9 Jun 2017 10:57:34 +0200 Subject: [PATCH 106/106] ENH: 2-ways interpolation to avoid smoothing_noise Simplifies also the code, examples, and documentation --- build_tools/travis/flake8_diff.sh | 4 +- doc/modules/preprocessing.rst | 22 ++-- doc/whats_new.rst | 2 +- ...plot_smoothing_noise_quantile_transform.py | 108 ------------------ sklearn/preprocessing/data.py | 46 +++----- sklearn/preprocessing/tests/test_data.py | 88 +++----------- 6 files changed, 42 insertions(+), 228 deletions(-) delete mode 100755 examples/preprocessing/plot_smoothing_noise_quantile_transform.py diff --git a/build_tools/travis/flake8_diff.sh b/build_tools/travis/flake8_diff.sh index 87ffdffd345ce..cf3dcb5577e9c 100755 --- a/build_tools/travis/flake8_diff.sh +++ b/build_tools/travis/flake8_diff.sh @@ -137,8 +137,8 @@ check_files() { if [[ "$MODIFIED_FILES" == "no_match" ]]; then echo "No file outside sklearn/externals and doc/sphinxext/sphinx_gallery has been modified" else - check_files "$(echo "$MODIFIED_FILES" | grep -v ^examples)" + check_files "$(echo "$MODIFIED_FILES" | grep -v ^examples)" --ignore=W503 # Examples are allowed to not have imports at top of file - check_files "$(echo "$MODIFIED_FILES" | grep ^examples)" --ignore=E402 + check_files "$(echo "$MODIFIED_FILES" | grep ^examples)" --ignore=E402,W503 fi echo -e "No problem detected by flake8\n" diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index fd7089faf346d..3b75eed6a7ff2 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -279,7 +279,7 @@ a 
uniform distribution with values between 0 and 1:: >>> quantile_transformer = preprocessing.QuantileTransformer(random_state=0) >>> X_train_trans = quantile_transformer.fit_transform(X_train) >>> X_test_trans = quantile_transformer.transform(X_test) - >>> np.percentile(X_train[:, 0], [0, 25, 50, 75, 100]) + >>> np.percentile(X_train[:, 0], [0, 25, 50, 75, 100]) # doctest: +SKIP array([ 4.3, 5.1, 5.8, 6.5, 7.9]) This feature corresponds to the sepal length in cm. Once the quantile @@ -306,27 +306,19 @@ setting ``output_distribution='normal'``:: ... output_distribution='normal', random_state=0) >>> X_trans = quantile_transformer.fit_transform(X) >>> quantile_transformer.quantiles_ # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - array([[ 4.30..., 2.00..., 1.00..., 0.09...], - [ 4.31... , 2.02..., 1.01..., 0.09...], - [ 4.32..., 2.05..., 1.02..., 0.09...], + array([[ 4.3..., 2..., 1..., 0.1...], + [ 4.31..., 2.02..., 1.01..., 0.1...], + [ 4.32..., 2.05..., 1.02..., 0.1...], ..., - [ 7.84... , 4.34..., 6.84..., 2.50...], - [ 7.87..., 4.37..., 6.87... , 2.50...], - [ 7.90..., 4.40..., 6.90..., 2.50...]]) + [ 7.84..., 4.34..., 6.84..., 2.5...], + [ 7.87..., 4.37..., 6.87..., 2.5...], + [ 7.9..., 4.4..., 6.9..., 2.5...]]) Thus the median of the input becomes the mean of the output, centered at 0. The normal output is clipped so that the input's minimum and maximum --- corresponding to the 1e-7 and 1 - 1e-7 quantiles respectively --- do not become infinite under the transformation. -:class:`QuantileTransformer` provides a ``smoothing_noise`` parameter (set to -True by default) to make the interpretation more intuitive when inspecting the -transformation. This is particularly useful when feature values are replicated -identically many times in the training set (e.g. prices, ordinal values such as -user ratings, coarse-grained units of time, etc.). See -:ref:`sphx_glr_auto_examples_preprocessing_plot_smoothing_noise_quantile_transform.py` -for more details. - .. _preprocessing_normalization: Normalization diff --git a/doc/whats_new.rst b/doc/whats_new.rst index e140bffc46ae5..022e8411a6edf 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -62,7 +62,7 @@ New features normalization based on quantiles. :issue:`8363` by :user:`Denis Engemann `, :user:`Guillaume Lemaitre `, `Olivier Grisel`_, `Raghav RV`_, - and :user:`Thierry Guillemot `. + :user:`Thierry Guillemot `_, and `Gael Varoquaux`_. Enhancements ............ diff --git a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py b/examples/preprocessing/plot_smoothing_noise_quantile_transform.py deleted file mode 100755 index f4ac17ddb795e..0000000000000 --- a/examples/preprocessing/plot_smoothing_noise_quantile_transform.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -""" -======================================================== -Effect of smoothing noise when using QuantileTransformer -======================================================== - -The parameter ``smoothing_noise`` can be used if some specific feature values -are repeated identically many times to the point of being predominant in the -dataset. This is typically be observed when the feature encode ordinal -values such as user ratings, prices, coarse-grained units of time, etc. By -default, a small Gaussian noise is added during ``fit`` time. - -Without smoothing noise, the ``QuantileTransformer`` will map those values to -some arbitrary value: the highest quantile value for all the inputs with the -same value. 
While this is usually not an issue when ``QuantileTransformer`` is -used as a preprocessing transformer for a subsequent supervised estimator, it -can lead to surprising results when manually inspecting the transformed values -(e.g. for visualization or reporting). - -The goal of the ``smoothing_noise`` is to make it possible to map those -repeated values to some middle quantile value to make interpretation more -intuitive as demonstrated in the following. - -""" - -# Author: Guillaume Lemaitre -# License: BSD 3 clause - -import numpy as np -import matplotlib.pyplot as plt - -from sklearn.preprocessing import QuantileTransformer - -print(__doc__) - -N_QUANTILES = 1000 -FEAT_VAL = 3.0 - - -def plot_transform_feat_val(ax, transformer, title): - """Plot the mapping function as well as a specific feature value.""" - ref = np.linspace(0, 1, num=N_QUANTILES) - - ax.plot(transformer.quantiles_, ref) - ax.scatter(FEAT_VAL, transformer.transform(FEAT_VAL), c='r', - label=r'$f({0}) = {1:.2f}$'.format( - FEAT_VAL, - np.ravel(transformer.transform(FEAT_VAL))[0])) - ax.set_xlabel('Feature values') - ax.set_ylabel('Quantiles in %') - ax.set_title(title) - ax.legend(loc=4) - # make nice axis layout - ax.spines['top'].set_visible(False) - ax.spines['right'].set_visible(False) - ax.get_xaxis().tick_bottom() - ax.get_yaxis().tick_left() - ax.set_xlim([1, 5.1]) - ax.set_ylim([0, 1]) - ax.spines['left'].set_position(('outward', 10)) - ax.spines['bottom'].set_position(('outward', 10)) - - -############################################################################### -# We can create a synthetic dataset representing the customers' -# ratings for a restaurant. The scale used is ranging from 1 to 5 and -# a large number of customers attributed a grade of 3 to the current -# restaurant. -# -# The ``smoothing_noise`` can be disabled in ``QuantileTransformer``. -# When dealing with a data set with a predominant value, this feature -# value can be affected to several quantiles. When provided to the transformer, -# this feature value will be mapped to the largest quantile. In practice, -# machine learning algorithms will usually not be affected by such -# characteristics. However, manual interpretation might be counter intuitive. -# -# From the below plot, we would expect that a vote corresponding to -# the value 3 would be mapped to the median (e.g., 0.5). However, the -# default behaviour of the 'interp' numpy function will map this -# feature value to the greater quantile as shown by the marker in the -# figure. -# -# A solution is to apply a small smoothing noise before computing the -# quantiles. The parameter ``smoothing_noise=True`` (default behaviour) offers -# this possibility as illustrated above. In this case, the marker is centered -# at the median as expected. 
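The behaviour that this (now removed) example demonstrates comes from how ``np.interp`` resolves ties: when several consecutive quantile landmarks share the same feature value, a query equal to that value is mapped to the last, i.e. highest, matching reference. A tiny standalone sketch of the effect, with toy numbers that are not taken from the example (the exact printed value assumes the tie handling described above)::

    import numpy as np

    # landmarks for a feature whose value 0.5 is heavily repeated:
    # three consecutive quantiles share the same feature value
    quantiles = np.array([0., 0.5, 0.5, 0.5, 1.])
    references = np.linspace(0., 1., num=quantiles.size)   # [0., 0.25, 0.5, 0.75, 1.]

    # the repeated value is mapped to the highest matching reference,
    # 0.75 here, rather than to the middle of the tied range (0.5)
    print(np.interp(0.5, quantiles, references))

Adding a small amount of noise before computing the quantiles breaks these ties, which is what the ``smoothing_noise`` option did; this patch removes that option in favour of averaging a forward and a reversed interpolation pass (see the ``data.py`` hunk below).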
- -X = np.array([1] * 2000 + - [2] * 1000 + - [3] * 7000 + - [4] * 2000 + - [5] * 1000).reshape(-1, 1) - -# create the subplots -_, (ax1, ax2) = plt.subplots(1, 2) - -qt = QuantileTransformer(n_quantiles=N_QUANTILES, - smoothing_noise=False) -qt.fit(X) -plot_transform_feat_val(ax1, qt, 'Without smoothing') - -qt = QuantileTransformer(n_quantiles=N_QUANTILES, - smoothing_noise=True) -qt.fit(X) -plot_transform_feat_val(ax2, qt, 'With smoothing') -plt.tight_layout() -plt.show() diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index cd17616da11f4..107656702bad9 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1987,11 +1987,6 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): computational efficiency. Note that the subsampling procedure may differ for value-identical sparse and dense matrices. - smoothing_noise : bool, optional (default=True) - Perturbs features at training time before computing quantiles by adding - Gaussian noise. It eases interpratation of the computed ``quantiles_`` - when a particular feature value is predominant. - random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -2034,29 +2029,18 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): See examples/preprocessing/plot_all_scaling.py for a comparison of the different scalers, transformers, and normalizers. - See examples/preprocessing/plot_smoothing_noise_quantile_transform.py for - an illustration of the ``smoothing_noise`` parameter use. - """ def __init__(self, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), - smoothing_noise=True, random_state=None, copy=True): + random_state=None, copy=True): self.n_quantiles = n_quantiles self.output_distribution = output_distribution self.ignore_implicit_zeros = ignore_implicit_zeros self.subsample = subsample - self.smoothing_noise = smoothing_noise self.random_state = random_state self.copy = copy - def _compute_quantiles_one_column(self, X_col, references, random_state): - """Private function to compute the quantiles for one feature.""" - if self.smoothing_noise is True: - X_col = X_col + random_state.normal(0, 1e-7, size=X_col.shape) - - return np.percentile(X_col, references) - def _dense_fit(self, X, random_state): """Compute percentiles for dense matrices. @@ -2080,9 +2064,7 @@ def _dense_fit(self, X, random_state): size=self.subsample, replace=False) col = col.take(subsample_idx, mode='clip') - self.quantiles_.append( - self._compute_quantiles_one_column(col, references, - random_state)) + self.quantiles_.append(np.percentile(col, references)) self.quantiles_ = np.transpose(self.quantiles_) def _sparse_fit(self, X, random_state): @@ -2127,8 +2109,7 @@ def _sparse_fit(self, X, random_state): self.quantiles_.append([0] * len(references)) else: self.quantiles_.append( - self._compute_quantiles_one_column(column_data, references, - random_state)) + np.percentile(column_data, references)) self.quantiles_ = np.transpose(self.quantiles_) def fit(self, X, y=None): @@ -2206,7 +2187,16 @@ def _transform_col(self, X_col, quantiles, inverse): upper_bound_x) if not inverse: - X_col = np.interp(X_col, quantiles, self.references_) + # Interpolate in one direction and in the other and take the + # mean. 
This is in case of repeated values in the features + # and hence repeated quantiles + # + # If we don't do this, only one extreme of the duplicated is + # used (the upper when we do assending, and the + # lower for descending). We take the mean of these two + X_col = .5 * (np.interp(X_col, quantiles, self.references_) + - np.interp(-X_col, -quantiles[::-1], + -self.references_[::-1])) else: X_col = np.interp(X_col, self.references_, quantiles) @@ -2333,7 +2323,6 @@ def quantile_transform(X, axis=0, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), - smoothing_noise=True, random_state=None, copy=False): """Transform features using quantiles information. @@ -2380,11 +2369,6 @@ def quantile_transform(X, axis=0, n_quantiles=1000, computational efficiency. Note that the subsampling procedure may differ for value-identical sparse and dense matrices. - smoothing_noise : bool, optional (default=True) - Perturbs features at training time before computing quantiles by adding - Gaussian noise. It eases interpratation of the computed ``quantiles_`` - when a particular feature value is predominant. - random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -2429,15 +2413,11 @@ def quantile_transform(X, axis=0, n_quantiles=1000, See examples/preprocessing/plot_all_scaling.py for a comparison of the different scalers, transformers, and normalizers. - See examples/preprocessing/plot_smoothing_noise_quantile_transform.py for - an illustration of the ``smoothing_noise`` parameter use. - """ n = QuantileTransformer(n_quantiles=n_quantiles, output_distribution=output_distribution, subsample=subsample, ignore_implicit_zeros=ignore_implicit_zeros, - smoothing_noise=smoothing_noise, random_state=random_state, copy=copy) if axis == 0: diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 5f2b5d7bc70cf..af7f28f8162c6 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -19,7 +19,6 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_less from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_greater_equal from sklearn.utils.testing import assert_less_equal from sklearn.utils.testing import assert_raises @@ -144,7 +143,8 @@ def test_polynomial_feature_names(): 'b c^2', 'c^3'], feature_names) # test some unicode poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) - feature_names = poly.get_feature_names([u"\u0001F40D", u"\u262E", u"\u05D0"]) + feature_names = poly.get_feature_names( + [u"\u0001F40D", u"\u262E", u"\u05D0"]) assert_array_equal([u"1", u"\u0001F40D", u"\u262E", u"\u05D0"], feature_names) @@ -857,14 +857,13 @@ def test_robust_scaler_iris_quantiles(): def test_quantile_transform_iris(): X = iris.data # uniform output distribution - transformer = QuantileTransformer(n_quantiles=30, smoothing_noise=False) + transformer = QuantileTransformer(n_quantiles=30) X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # normal output distribution transformer = QuantileTransformer(n_quantiles=30, - output_distribution='normal', - smoothing_noise=False) + output_distribution='normal') X_trans = 
transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) @@ -943,7 +942,7 @@ def test_quantile_transform_sparse_ignore_zeros(): [0, 1]]) X_sparse = sparse.csc_matrix(X) transformer = QuantileTransformer(ignore_implicit_zeros=True, - n_quantiles=5, smoothing_noise=False) + n_quantiles=5) # dense case -> warning raise assert_warns_message(UserWarning, "'ignore_implicit_zeros' takes effect" @@ -977,16 +976,16 @@ def test_quantile_transform_sparse_ignore_zeros(): assert_almost_equal(X_expected, X_trans.A) transformer = QuantileTransformer(ignore_implicit_zeros=True, - n_quantiles=5, smoothing_noise=False) + n_quantiles=5) X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1]) X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1]) X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6]) X_sparse = sparse.csc_matrix((X_data, (X_row, X_col))) X_trans = transformer.fit_transform(X_sparse) X_expected = np.array([[0, 1], - [0, 0.5], - [0, 0.5], - [0, 0.5], + [0, 0.375], + [0, 0.375], + [0, 0.375], [0, 1], [0, 0], [0, 1]]) @@ -997,8 +996,7 @@ def test_quantile_transform_sparse_ignore_zeros(): transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5, subsample=8, - random_state=0, - smoothing_noise=False) + random_state=0) X_trans = transformer.fit_transform(X_sparse) assert_almost_equal(X_expected, X_trans.A) assert_almost_equal(X_sparse.A, transformer.inverse_transform(X_trans).A) @@ -1011,7 +1009,7 @@ def test_quantile_transform_dense_toy(): [75, 8, 9.5], [100, 10, 0.1]]) - transformer = QuantileTransformer(n_quantiles=5, smoothing_noise=False) + transformer = QuantileTransformer(n_quantiles=5) transformer.fit(X) # using the a uniform output, each entry of X should be map between 0 and 1 @@ -1021,7 +1019,7 @@ def test_quantile_transform_dense_toy(): assert_almost_equal(np.sort(X_trans, axis=0), X_expected) X_test = np.array([ - [-1, 1, 0], + [-1, 1, 0], [101, 11, 10], ]) X_expected = np.array([ @@ -1095,7 +1093,7 @@ def test_quantile_transform_sparse_toy(): X = sparse.csc_matrix(X) - transformer = QuantileTransformer(n_quantiles=10, smoothing_noise=False) + transformer = QuantileTransformer(n_quantiles=10) transformer.fit(X) X_trans = transformer.fit_transform(X) @@ -1105,9 +1103,8 @@ def test_quantile_transform_sparse_toy(): X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) - transformer_dense = QuantileTransformer(n_quantiles=10, - smoothing_noise=False).fit( - X.toarray()) + transformer_dense = QuantileTransformer(n_quantiles=10).fit( + X.toarray()) X_trans = transformer_dense.transform(X) assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.) 
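Several expected values in the sparse ``ignore_implicit_zeros`` test above change (for example ``0.5`` becomes ``0.375``) because ``transform`` now averages a forward and a reversed ``np.interp`` pass, as introduced in the ``data.py`` hunk of this patch. A minimal standalone sketch of that averaging trick, with toy landmarks that are not taken from the tests (the individual outputs assume ``np.interp``'s usual tie handling)::

    import numpy as np

    quantiles = np.array([0., 0.5, 0.5, 0.5, 1.])            # tied landmarks
    references = np.linspace(0., 1., num=quantiles.size)     # [0., 0.25, 0.5, 0.75, 1.]
    x = 0.5                                                   # the repeated feature value

    # the forward pass hits the upper end of the tie, the reversed
    # (negated) pass hits the lower end; their mean is the middle
    upper = np.interp(x, quantiles, references)                      # 0.75
    lower = -np.interp(-x, -quantiles[::-1], -references[::-1])      # 0.25
    print(.5 * (upper + lower))                                      # 0.5

Averaging the two passes maps repeated values to the midpoint of their quantile range deterministically, which is why the smoothing noise, its example, and its dedicated tests are dropped in this patch.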
@@ -1137,11 +1134,9 @@ def test_quantile_transform_bounds(): # check sparse and dense are consistent X_trans = QuantileTransformer(n_quantiles=3, - smoothing_noise=False, random_state=0).fit_transform(X_dense) assert_array_almost_equal(X_trans, X_dense) X_trans_sp = QuantileTransformer(n_quantiles=3, - smoothing_noise=False, random_state=0).fit_transform(X_sparse) assert_array_almost_equal(X_trans_sp.A, X_dense) assert_array_almost_equal(X_trans, X_trans_sp.A) @@ -1154,14 +1149,13 @@ def test_quantile_transform_bounds(): X1 = np.array([[0, 0.1], [0, 0.5], [1, 0.1]]) - transformer = QuantileTransformer(n_quantiles=3, - smoothing_noise=False).fit(X) + transformer = QuantileTransformer(n_quantiles=3).fit(X) X_trans = transformer.transform(X1) assert_array_almost_equal(X_trans, X1) # check that values outside of the range learned will be mapped properly. X = np.random.random((1000, 1)) - transformer = QuantileTransformer(smoothing_noise=False) + transformer = QuantileTransformer() transformer.fit(X) assert_equal(transformer.transform(-10), transformer.transform(np.min(X))) assert_equal(transformer.transform(10), transformer.transform(np.max(X))) @@ -1173,57 +1167,13 @@ def test_quantile_transform_bounds(): np.max(transformer.references_))) -def test_quantile_transform_add_noise_subsamples(): - # toy examples - unique_feature = [0, 0.5, 1] - X = np.transpose([[unique_feature[0]] * 1 + - [unique_feature[1]] * 7 + - [unique_feature[2]] * 2]) - transformer = QuantileTransformer(n_quantiles=100, smoothing_noise=True, - random_state=0) - transformer.fit(X) - # check that the feature values associated to quantiles are strictly - # monitically increasing as suggested by the 'interp' function from numpy - diff_quantiles = np.diff(transformer.quantiles_, axis=0) - map(assert_greater, diff_quantiles, [0] * len(diff_quantiles)) +def test_quantile_transform_and_inverse(): # iris dataset X = iris.data - transformer = QuantileTransformer(n_quantiles=1000, smoothing_noise=True, - random_state=0) + transformer = QuantileTransformer(n_quantiles=1000, random_state=0) X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) - # check that the feature values associated to quantiles are - # strictly monitically increasing as suggested by the 'interp' - # function from numpy - diff_quantiles = np.diff(transformer.quantiles_, axis=0) - for dq in diff_quantiles.T: - map(assert_greater, dq, [0] * len(dq)) - - -def test_quantile_transform_numpy_interp_behaviour(): - # The quantile transformer relies on the numpy implementation of 'interp' - # function. In the presence of a predominant constant feature values or a - # large number of quantiles, a single feature value is mapped to different - # quantiles. The default behaviour of 'interp' will be returning the - # largest quantile associated to the feature value. This test attends to - # check if there is any behavorial changes in the 'interp' function and to - # act accordingly. This implementation subtilities is mention in the - # docstring of the 'interp' function. 
- - unique_feature = [0, 0.5, 1] - X = np.transpose([[unique_feature[0]] * 1 + - [unique_feature[1]] * 7 + - [unique_feature[2]] * 2]) - qt = QuantileTransformer(n_quantiles=100, smoothing_noise=False) - qt.fit(X) - ref = np.linspace(0., 1., num=qt.n_quantiles) - max_quantiles_idx = [np.flatnonzero(qt.quantiles_ == unique_feature[i])[-1] - for i in range(len(unique_feature))] - X_trans = np.transpose([[ref[max_quantiles_idx[0]]] * 1 + - [ref[max_quantiles_idx[1]]] * 7 + - [ref[max_quantiles_idx[2]]] * 2]) - assert_array_almost_equal(qt.transform(X), X_trans) def test_robust_scaler_invalid_range():