diff --git a/doc/glossary.rst b/doc/glossary.rst
index 533a8eac63d04..37e96a7d394f7 100644
--- a/doc/glossary.rst
+++ b/doc/glossary.rst
@@ -294,7 +294,7 @@ General Concepts
         convergence of the training loss, to avoid over-fitting. This is
         generally done by monitoring the generalization score on a validation
         set. When available, it is activated through the parameter
-        ``early_stopping`` or by setting a postive :term:`n_iter_no_change`.
+        ``early_stopping`` or by setting a positive :term:`n_iter_no_change`.
 
     estimator instance
         We sometimes use this terminology to distinguish an :term:`estimator`
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 70f24def72f20..dd1f798ccb3aa 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -309,20 +309,34 @@ Power transforms are a family of parametric, monotonic transformations that aim
 to map data from any distribution to as close to a Gaussian distribution as
 possible in order to stabilize variance and minimize skewness.
 
-:class:`PowerTransformer` currently provides one such power transformation,
-the Box-Cox transform. The Box-Cox transform is given by:
+:class:`PowerTransformer` currently provides two such power transformations,
+the Yeo-Johnson transform and the Box-Cox transform.
+
+The Yeo-Johnson transform is given by:
 
 .. math::
-    y_i^{(\lambda)} =
+    x_i^{(\lambda)} =
     \begin{cases}
-    \dfrac{y_i^\lambda - 1}{\lambda} & \text{if } \lambda \neq 0, \\[8pt]
-    \ln{(y_i)} & \text{if } \lambda = 0,
+    [(x_i + 1)^\lambda - 1] / \lambda & \text{if } \lambda \neq 0, x_i \geq 0, \\[8pt]
+    \ln{(x_i + 1)} & \text{if } \lambda = 0, x_i \geq 0, \\[8pt]
+    -[(-x_i + 1)^{2 - \lambda} - 1] / (2 - \lambda) & \text{if } \lambda \neq 2, x_i < 0, \\[8pt]
+    - \ln (- x_i + 1) & \text{if } \lambda = 2, x_i < 0
     \end{cases}
 
-Box-Cox can only be applied to strictly positive data. The transformation is
-parameterized by :math:`\lambda`, which is determined through maximum likelihood
-estimation. Here is an example of using Box-Cox to map samples drawn from a
-lognormal distribution to a normal distribution::
+while the Box-Cox transform is given by:
+
+.. math::
+    x_i^{(\lambda)} =
+    \begin{cases}
+    \dfrac{x_i^\lambda - 1}{\lambda} & \text{if } \lambda \neq 0, \\[8pt]
+    \ln{(x_i)} & \text{if } \lambda = 0,
+    \end{cases}
+
+
+Box-Cox can only be applied to strictly positive data. In both methods, the
+transformation is parameterized by :math:`\lambda`, which is determined through
+maximum likelihood estimation. Here is an example of using Box-Cox to map
+samples drawn from a lognormal distribution to a normal distribution::
 
     >>> pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)
     >>> X_lognormal = np.random.RandomState(616).lognormal(size=(3, 3))
@@ -339,13 +353,14 @@ While the above example sets the `standardize` option to `False`,
 :class:`PowerTransformer` will apply zero-mean, unit-variance normalization
 to the transformed output by default.
 
-Below are examples of Box-Cox applied to various probability distributions.
-Note that when applied to certain distributions, Box-Cox achieves very
-Gaussian-like results, but with others, it is ineffective. This highlights
-the importance of visualizing the data before and after transformation.
+Below are examples of Box-Cox and Yeo-Johnson applied to various probability
+distributions. Note that when applied to certain distributions, the power
+transforms achieve very Gaussian-like results, but with others, they are
+ineffective.
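As a quick numerical check of the piecewise definition above, here is a minimal
NumPy sketch of the Yeo-Johnson mapping; the helper name ``yeo_johnson`` and the
example ``lmbda`` are illustrative only, not part of the patch::

    import numpy as np

    def yeo_johnson(x, lmbda):
        # Element-wise Yeo-Johnson mapping, following the four cases above.
        x = np.asarray(x, dtype=float)
        out = np.empty_like(x)
        pos = x >= 0
        if lmbda != 0:
            out[pos] = ((x[pos] + 1) ** lmbda - 1) / lmbda
        else:
            out[pos] = np.log1p(x[pos])
        if lmbda != 2:
            out[~pos] = -((1 - x[~pos]) ** (2 - lmbda) - 1) / (2 - lmbda)
        else:
            out[~pos] = -np.log1p(-x[~pos])
        return out

    print(yeo_johnson(np.array([-1.5, 0.0, 2.0]), lmbda=0.5))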
This highlights the importance of visualizing the data before and +after transformation. -.. figure:: ../auto_examples/preprocessing/images/sphx_glr_plot_power_transformer_001.png - :target: ../auto_examples/preprocessing/plot_power_transformer.html +.. figure:: ../auto_examples/preprocessing/images/sphx_glr_plot_map_data_to_normal_001.png + :target: ../auto_examples/preprocessing/plot_map_data_to_normal.html :align: center :scale: 100 diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 86c8d7a8ddab7..5b4a239368caf 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -136,12 +136,15 @@ Preprocessing DataFrames. :issue:`9012` by `Andreas Müller`_ and `Joris Van den Bossche`_, and :issue:`11315` by :user:`Thomas Fan `. -- Added :class:`preprocessing.PowerTransformer`, which implements the Box-Cox - power transformation, allowing users to map data from any distribution to a - Gaussian distribution. This is useful as a variance-stabilizing transformation - in situations where normality and homoscedasticity are desirable. +- Added :class:`preprocessing.PowerTransformer`, which implements the + Yeo-Johnson and Box-Cox power transformations. Power transformations try to + find a set of feature-wise parametric transformations to approximately map + data to a Gaussian distribution centered at zero and with unit variance. + This is useful as a variance-stabilizing transformation in situations where + normality and homoscedasticity are desirable. :issue:`10210` by :user:`Eric Chang ` and - :user:`Maniteja Nandana `. + :user:`Maniteja Nandana `, and :issue:`11520` by :user:`Nicolas + Hug `. - Added the :class:`compose.TransformedTargetRegressor` which transforms the target y before fitting a regression model. The predictions are mapped diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 92cd635e2a06d..07fd3662da448 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -87,6 +87,8 @@ MaxAbsScaler().fit_transform(X)), ('Data after robust scaling', RobustScaler(quantile_range=(25, 75)).fit_transform(X)), + ('Data after power transformation (Yeo-Johnson)', + PowerTransformer(method='yeo-johnson').fit_transform(X)), ('Data after power transformation (Box-Cox)', PowerTransformer(method='box-cox').fit_transform(X)), ('Data after quantile transformation (gaussian pdf)', @@ -294,21 +296,21 @@ def make_plot(item_idx): make_plot(4) ############################################################################## -# PowerTransformer (Box-Cox) -# -------------------------- +# PowerTransformer +# ---------------- # -# ``PowerTransformer`` applies a power transformation to each -# feature to make the data more Gaussian-like. Currently, -# ``PowerTransformer`` implements the Box-Cox transform. The Box-Cox transform -# finds the optimal scaling factor to stabilize variance and mimimize skewness -# through maximum likelihood estimation. By default, ``PowerTransformer`` also -# applies zero-mean, unit variance normalization to the transformed output. -# Note that Box-Cox can only be applied to positive, non-zero data. Income and -# number of households happen to be strictly positive, but if negative values -# are present, a constant can be added to each feature to shift it into the -# positive range - this is known as the two-parameter Box-Cox transform. +# ``PowerTransformer`` applies a power transformation to each feature to make +# the data more Gaussian-like. 
Currently, ``PowerTransformer`` implements the
+# Yeo-Johnson and Box-Cox transforms. The power transform finds the optimal
+# scaling factor to stabilize variance and minimize skewness through maximum
+# likelihood estimation. By default, ``PowerTransformer`` also applies
+# zero-mean, unit variance normalization to the transformed output. Note that
+# Box-Cox can only be applied to strictly positive data. Income and number of
+# households happen to be strictly positive, but if negative values are
+# present, the Yeo-Johnson transform is to be preferred.
 
 make_plot(5)
+make_plot(6)
 
 ##############################################################################
 # QuantileTransformer (Gaussian output)
@@ -319,7 +321,7 @@ def make_plot(item_idx):
 # Note that this non-parametetric transformer introduces saturation artifacts
 # for extreme values.
 
-make_plot(6)
+make_plot(7)
 
 ###################################################################
 # QuantileTransformer (uniform output)
@@ -337,7 +339,7 @@ def make_plot(item_idx):
 # any outlier by setting them to the a priori defined range boundaries (0 and
 # 1).
 
-make_plot(7)
+make_plot(8)
 
 ##############################################################################
 # Normalizer
@@ -350,6 +352,6 @@ def make_plot(item_idx):
 # transformed data only lie in the positive quadrant. This would not be the
 # case if some original features had a mix of positive and negative values.
 
-make_plot(8)
+make_plot(9)
 
 plt.show()
diff --git a/examples/preprocessing/plot_map_data_to_normal.py b/examples/preprocessing/plot_map_data_to_normal.py
new file mode 100644
index 0000000000000..b8b7625f3c02b
--- /dev/null
+++ b/examples/preprocessing/plot_map_data_to_normal.py
@@ -0,0 +1,137 @@
+"""
+=================================
+Map data to a normal distribution
+=================================
+
+This example demonstrates the use of the Box-Cox and Yeo-Johnson transforms
+through :class:`preprocessing.PowerTransformer` to map data from various
+distributions to a normal distribution.
+
+The power transform is useful as a transformation in modeling problems where
+homoscedasticity and normality are desired. Below are examples of Box-Cox and
+Yeo-Johnson applied to six different probability distributions: Lognormal,
+Chi-squared, Weibull, Gaussian, Uniform, and Bimodal.
+
+Note that the transformations successfully map the data to a normal
+distribution when applied to certain datasets, but are ineffective with others.
+This highlights the importance of visualizing the data before and after
+transformation.
+
+Also note that even though Box-Cox seems to perform better than Yeo-Johnson for
+lognormal and chi-squared distributions, keep in mind that Box-Cox does not
+support inputs with negative values.
+
+For comparison, we also add the output from
+:class:`preprocessing.QuantileTransformer`. It can force any arbitrary
+distribution into a gaussian, provided that there are enough training samples
+(thousands). Because it is a non-parametric method, it is harder to interpret
+than the parametric ones (Box-Cox and Yeo-Johnson).
+
+On "small" datasets (less than a few hundred points), the quantile transformer
+is prone to overfitting. The use of the power transform is then recommended.
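To see the sample-size trade-off described above in code, here is a minimal
comparison; it is not part of the example file, and the sample size, seed and
skewness check are arbitrary illustration choices::

    import numpy as np
    from scipy import stats
    from sklearn.preprocessing import PowerTransformer, QuantileTransformer

    rng = np.random.RandomState(0)
    X = rng.lognormal(size=(3000, 1))           # plenty of training samples

    qt = QuantileTransformer(output_distribution='normal', random_state=0)
    pt = PowerTransformer(method='box-cox')

    print(stats.skew(X))                        # strongly skewed input
    print(stats.skew(qt.fit_transform(X)))      # near 0 given many samples
    print(stats.skew(pt.fit_transform(X)))      # parametric fit, also near 0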
+""" + +# Author: Eric Chang +# Nicolas Hug +# License: BSD 3 clause + +import numpy as np +import matplotlib.pyplot as plt + +from sklearn.preprocessing import PowerTransformer +from sklearn.preprocessing import QuantileTransformer +from sklearn.model_selection import train_test_split + +print(__doc__) + + +N_SAMPLES = 1000 +FONT_SIZE = 6 +BINS = 30 + + +rng = np.random.RandomState(304) +bc = PowerTransformer(method='box-cox') +yj = PowerTransformer(method='yeo-johnson') +qt = QuantileTransformer(output_distribution='normal', random_state=rng) +size = (N_SAMPLES, 1) + + +# lognormal distribution +X_lognormal = rng.lognormal(size=size) + +# chi-squared distribution +df = 3 +X_chisq = rng.chisquare(df=df, size=size) + +# weibull distribution +a = 50 +X_weibull = rng.weibull(a=a, size=size) + +# gaussian distribution +loc = 100 +X_gaussian = rng.normal(loc=loc, size=size) + +# uniform distribution +X_uniform = rng.uniform(low=0, high=1, size=size) + +# bimodal distribution +loc_a, loc_b = 100, 105 +X_a, X_b = rng.normal(loc=loc_a, size=size), rng.normal(loc=loc_b, size=size) +X_bimodal = np.concatenate([X_a, X_b], axis=0) + + +# create plots +distributions = [ + ('Lognormal', X_lognormal), + ('Chi-squared', X_chisq), + ('Weibull', X_weibull), + ('Gaussian', X_gaussian), + ('Uniform', X_uniform), + ('Bimodal', X_bimodal) +] + +colors = ['firebrick', 'darkorange', 'goldenrod', + 'seagreen', 'royalblue', 'darkorchid'] + +fig, axes = plt.subplots(nrows=8, ncols=3, figsize=plt.figaspect(2)) +axes = axes.flatten() +axes_idxs = [(0, 3, 6, 9), (1, 4, 7, 10), (2, 5, 8, 11), (12, 15, 18, 21), + (13, 16, 19, 22), (14, 17, 20, 23)] +axes_list = [(axes[i], axes[j], axes[k], axes[l]) + for (i, j, k, l) in axes_idxs] + + +for distribution, color, axes in zip(distributions, colors, axes_list): + name, X = distribution + X_train, X_test = train_test_split(X, test_size=.5) + + # perform power transforms and quantile transform + X_trans_bc = bc.fit(X_train).transform(X_test) + lmbda_bc = round(bc.lambdas_[0], 2) + X_trans_yj = yj.fit(X_train).transform(X_test) + lmbda_yj = round(yj.lambdas_[0], 2) + X_trans_qt = qt.fit(X_train).transform(X_test) + + ax_original, ax_bc, ax_yj, ax_qt = axes + + ax_original.hist(X_train, color=color, bins=BINS) + ax_original.set_title(name, fontsize=FONT_SIZE) + ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE) + + for ax, X_trans, meth_name, lmbda in zip( + (ax_bc, ax_yj, ax_qt), + (X_trans_bc, X_trans_yj, X_trans_qt), + ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'), + (lmbda_bc, lmbda_yj, None)): + ax.hist(X_trans, color=color, bins=BINS) + title = 'After {}'.format(meth_name) + if lmbda is not None: + title += '\n$\lambda$ = {}'.format(lmbda) + ax.set_title(title, fontsize=FONT_SIZE) + ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE) + ax.set_xlim([-3.5, 3.5]) + + +plt.tight_layout() +plt.show() diff --git a/examples/preprocessing/plot_power_transformer.py b/examples/preprocessing/plot_power_transformer.py deleted file mode 100644 index 52ce0d3121f73..0000000000000 --- a/examples/preprocessing/plot_power_transformer.py +++ /dev/null @@ -1,109 +0,0 @@ -""" -========================================================== -Using PowerTransformer to apply the Box-Cox transformation -========================================================== - -This example demonstrates the use of the Box-Cox transform through -:class:`preprocessing.PowerTransformer` to map data from various distributions -to a normal distribution. 
- -Box-Cox is useful as a transformation in modeling problems where -homoscedasticity and normality are desired. Below are examples of Box-Cox -applied to six different probability distributions: Lognormal, Chi-squared, -Weibull, Gaussian, Uniform, and Bimodal. - -Note that the transformation successfully maps the data to a normal -distribution when applied to certain datasets, but is ineffective with others. -This highlights the importance of visualizing the data before and after -transformation. Also note that while the standardize option is set to False for -the plot examples, by default, :class:`preprocessing.PowerTransformer` also -applies zero-mean, unit-variance standardization to the transformed outputs. -""" - -# Author: Eric Chang -# License: BSD 3 clause - -import numpy as np -import matplotlib.pyplot as plt - -from sklearn.preprocessing import PowerTransformer, minmax_scale - -print(__doc__) - - -N_SAMPLES = 3000 -FONT_SIZE = 6 -BINS = 100 - - -pt = PowerTransformer(method='box-cox', standardize=False) -rng = np.random.RandomState(304) -size = (N_SAMPLES, 1) - - -# lognormal distribution -X_lognormal = rng.lognormal(size=size) - -# chi-squared distribution -df = 3 -X_chisq = rng.chisquare(df=df, size=size) - -# weibull distribution -a = 50 -X_weibull = rng.weibull(a=a, size=size) - -# gaussian distribution -loc = 100 -X_gaussian = rng.normal(loc=loc, size=size) - -# uniform distribution -X_uniform = rng.uniform(low=0, high=1, size=size) - -# bimodal distribution -loc_a, loc_b = 100, 105 -X_a, X_b = rng.normal(loc=loc_a, size=size), rng.normal(loc=loc_b, size=size) -X_bimodal = np.concatenate([X_a, X_b], axis=0) - - -# create plots -distributions = [ - ('Lognormal', X_lognormal), - ('Chi-squared', X_chisq), - ('Weibull', X_weibull), - ('Gaussian', X_gaussian), - ('Uniform', X_uniform), - ('Bimodal', X_bimodal) -] - -colors = ['firebrick', 'darkorange', 'goldenrod', - 'seagreen', 'royalblue', 'darkorchid'] - -fig, axes = plt.subplots(nrows=4, ncols=3) -axes = axes.flatten() -axes_idxs = [(0, 3), (1, 4), (2, 5), (6, 9), (7, 10), (8, 11)] -axes_list = [(axes[i], axes[j]) for i, j in axes_idxs] - - -for distribution, color, axes in zip(distributions, colors, axes_list): - name, X = distribution - # scale all distributions to the range [0, 10] - X = minmax_scale(X, feature_range=(1e-10, 10)) - - # perform power transform - X_trans = pt.fit_transform(X) - lmbda = round(pt.lambdas_[0], 2) - - ax_original, ax_trans = axes - - ax_original.hist(X, color=color, bins=BINS) - ax_original.set_title(name, fontsize=FONT_SIZE) - ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE) - - ax_trans.hist(X_trans, color=color, bins=BINS) - ax_trans.set_title('{} after Box-Cox, $\lambda$ = {}'.format(name, lmbda), - fontsize=FONT_SIZE) - ax_trans.tick_params(axis='both', which='major', labelsize=FONT_SIZE) - - -plt.tight_layout() -plt.show() diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 2b17f41010eeb..1256b6522e928 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -17,6 +17,7 @@ import numpy as np from scipy import sparse from scipy import stats +from scipy import optimize from ..base import BaseEstimator, TransformerMixin from ..externals import six @@ -2400,10 +2401,12 @@ class PowerTransformer(BaseEstimator, TransformerMixin): modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. - Currently, PowerTransformer supports the Box-Cox transform. 
Box-Cox
-    requires input data to be strictly positive. The optimal parameter
-    for stabilizing variance and minimizing skewness is estimated through
-    maximum likelihood.
+    Currently, PowerTransformer supports the Box-Cox transform and the
+    Yeo-Johnson transform. The optimal parameter for stabilizing variance and
+    minimizing skewness is estimated through maximum likelihood.
+
+    Box-Cox requires input data to be strictly positive, while Yeo-Johnson
+    supports both positive and negative data.
 
     By default, zero-mean, unit-variance normalization is applied to the
     transformed data.
@@ -2412,9 +2415,11 @@ class PowerTransformer(BaseEstimator, TransformerMixin):
     Parameters
     ----------
-    method : str, (default='box-cox')
-        The power transform method. Currently, 'box-cox' (Box-Cox transform)
-        is the only option available.
+    method : str, (default='yeo-johnson')
+        The power transform method. Available methods are:
+
+        - 'yeo-johnson' [1]_, works with positive and negative values
+        - 'box-cox' [2]_, only works with strictly positive values
 
     standardize : boolean, default=True
         Set to True to apply zero-mean, unit-variance normalization to the
@@ -2435,13 +2440,13 @@ class PowerTransformer(BaseEstimator, TransformerMixin):
     >>> pt = PowerTransformer()
     >>> data = [[1, 2], [3, 2], [4, 5]]
     >>> print(pt.fit(data))
-    PowerTransformer(copy=True, method='box-cox', standardize=True)
-    >>> print(pt.lambdas_) # doctest: +ELLIPSIS
-    [ 1.051... -2.345...]
-    >>> print(pt.transform(data)) # doctest: +ELLIPSIS
-    [[-1.332... -0.707...]
-     [ 0.256... -0.707...]
-     [ 1.076...  1.414...]]
+    PowerTransformer(copy=True, method='yeo-johnson', standardize=True)
+    >>> print(pt.lambdas_)
+    [1.38668178e+00 5.93926346e-09]
+    >>> print(pt.transform(data))
+    [[-1.31616039 -0.70710678]
+     [ 0.20998268 -0.70710678]
+     [ 1.1061777   1.41421356]]
 
     See also
     --------
@@ -2461,21 +2466,24 @@ class PowerTransformer(BaseEstimator, TransformerMixin):
     References
     ----------
-    G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
-    Royal Statistical Society B, 26, 211-252 (1964).
+    .. [1] I.K. Yeo and R.A. Johnson, "A new family of power transformations to
+           improve normality or symmetry." Biometrika, 87(4), pp.954-959,
+           (2000).
+
+    .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal
+           of the Royal Statistical Society B, 26, 211-252 (1964).
     """
-    def __init__(self, method='box-cox', standardize=True, copy=True):
+    def __init__(self, method='yeo-johnson', standardize=True, copy=True):
         self.method = method
         self.standardize = standardize
         self.copy = copy
 
     def fit(self, X, y=None):
-        """Estimate the optimal parameter for each feature.
+        """Estimate the optimal parameter lambda for each feature.
 
-        The optimal parameter for minimizing skewness is estimated
-        on each feature independently. If the method is Box-Cox,
-        the lambdas are estimated using maximum likelihood.
+        The optimal lambda parameter for minimizing skewness is estimated on
+        each feature independently using maximum likelihood.
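To make the per-feature estimation concrete, here is a minimal sketch; it is
not part of the patch, assumes scikit-learn >= 0.20, and uses arbitrary toy
data::

    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    rng = np.random.RandomState(0)
    # two columns with very different shapes: skewed and roughly symmetric
    X = np.column_stack([rng.lognormal(size=1000), rng.normal(size=1000)])

    pt = PowerTransformer(method='yeo-johnson').fit(X)
    print(pt.lambdas_)   # one lambda per column, estimated independently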
Parameters ---------- @@ -2488,27 +2496,44 @@ def fit(self, X, y=None): ------- self : object """ + self._fit(X, y=y, force_transform=False) + return self + + def fit_transform(self, X, y=None): + return self._fit(X, y, force_transform=True) + + def _fit(self, X, y=None, force_transform=False): X = self._check_input(X, check_positive=True, check_method=True) - self.lambdas_ = [] - transformed = [] + if not self.copy and not force_transform: # if call from fit() + X = X.copy() # force copy so that fit does not change X inplace + optim_function = {'box-cox': self._box_cox_optimize, + 'yeo-johnson': self._yeo_johnson_optimize + }[self.method] + self.lambdas_ = [] for col in X.T: - # the computation of lambda is influenced by NaNs and we need to - # get rid of them to compute them. - _, lmbda = stats.boxcox(col[~np.isnan(col)], lmbda=None) - col_trans = boxcox(col, lmbda) - self.lambdas_.append(lmbda) - transformed.append(col_trans) - + with np.errstate(invalid='ignore'): # hide NaN warnings + lmbda = optim_function(col) + self.lambdas_.append(lmbda) self.lambdas_ = np.array(self.lambdas_) - transformed = np.array(transformed) + + if self.standardize or force_transform: + transform_function = {'box-cox': boxcox, + 'yeo-johnson': self._yeo_johnson_transform + }[self.method] + for i, lmbda in enumerate(self.lambdas_): + with np.errstate(invalid='ignore'): # hide NaN warnings + X[:, i] = transform_function(X[:, i], lmbda) if self.standardize: - self._scaler = StandardScaler() - self._scaler.fit(X=transformed.T) + self._scaler = StandardScaler(copy=False) + if force_transform: + X = self._scaler.fit_transform(X) + else: + self._scaler.fit(X) - return self + return X def transform(self, X): """Apply the power transform to each feature using the fitted lambdas. @@ -2517,12 +2542,21 @@ def transform(self, X): ---------- X : array-like, shape (n_samples, n_features) The data to be transformed using a power transformation. + + Returns + ------- + X_trans : array-like, shape (n_samples, n_features) + The transformed data. """ check_is_fitted(self, 'lambdas_') X = self._check_input(X, check_positive=True, check_shape=True) + transform_function = {'box-cox': boxcox, + 'yeo-johnson': self._yeo_johnson_transform + }[self.method] for i, lmbda in enumerate(self.lambdas_): - X[:, i] = boxcox(X[:, i], lmbda) + with np.errstate(invalid='ignore'): # hide NaN warnings + X[:, i] = transform_function(X[:, i], lmbda) if self.standardize: X = self._scaler.transform(X) @@ -2539,10 +2573,26 @@ def inverse_transform(self, X): else: X = (X_trans * lambda + 1) ** (1 / lambda) + The inverse of the Yeo-Johnson transformation is given by:: + + if X >= 0 and lambda == 0: + X = exp(X_trans) - 1 + elif X >= 0 and lambda != 0: + X = (X_trans * lambda + 1) ** (1 / lambda) - 1 + elif X < 0 and lambda != 2: + X = 1 - (-(2 - lambda) * X_trans + 1) ** (1 / (2 - lambda)) + elif X < 0 and lambda == 2: + X = 1 - exp(-X_trans) + Parameters ---------- X : array-like, shape (n_samples, n_features) The transformed data. 
+ + Returns + ------- + X : array-like, shape (n_samples, n_features) + The original data """ check_is_fitted(self, 'lambdas_') X = self._check_input(X, check_shape=True) @@ -2550,16 +2600,120 @@ def inverse_transform(self, X): if self.standardize: X = self._scaler.inverse_transform(X) + inv_fun = {'box-cox': self._box_cox_inverse_tranform, + 'yeo-johnson': self._yeo_johnson_inverse_transform + }[self.method] for i, lmbda in enumerate(self.lambdas_): - x = X[:, i] - if lmbda == 0: - x_inv = np.exp(x) - else: - x_inv = (x * lmbda + 1) ** (1 / lmbda) - X[:, i] = x_inv + with np.errstate(invalid='ignore'): # hide NaN warnings + X[:, i] = inv_fun(X[:, i], lmbda) return X + def _box_cox_inverse_tranform(self, x, lmbda): + """Return inverse-transformed input x following Box-Cox inverse + transform with parameter lambda. + """ + if lmbda == 0: + x_inv = np.exp(x) + else: + x_inv = (x * lmbda + 1) ** (1 / lmbda) + + return x_inv + + def _yeo_johnson_inverse_transform(self, x, lmbda): + """Return inverse-transformed input x following Yeo-Johnson inverse + transform with parameter lambda. + + Notes + ----- + We're comparing lmbda to 1e-19 instead of strict equality to 0. See + scipy/special/_boxcox.pxd for a rationale behind this + """ + x_inv = np.zeros(x.shape, dtype=x.dtype) + pos = x >= 0 + + # when x >= 0 + if lmbda < 1e-19: + x_inv[pos] = np.exp(x[pos]) - 1 + else: # lmbda != 0 + x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1 + + # when x < 0 + if lmbda < 2 - 1e-19: + x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, + 1 / (2 - lmbda)) + else: # lmbda == 2 + x_inv[~pos] = 1 - np.exp(-x[~pos]) + + return x_inv + + def _yeo_johnson_transform(self, x, lmbda): + """Return transformed input x following Yeo-Johnson transform with + parameter lambda. + + Notes + ----- + We're comparing lmbda to 1e-19 instead of strict equality to 0. See + scipy/special/_boxcox.pxd for a rationale behind this + """ + + out = np.zeros(shape=x.shape, dtype=x.dtype) + pos = x >= 0 # binary mask + + # when x >= 0 + if lmbda < 1e-19: + out[pos] = np.log(x[pos] + 1) + else: # lmbda != 0 + out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda + + # when x < 0 + if lmbda < 2 - 1e-19: + out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda) + else: # lmbda == 2 + out[~pos] = -np.log(-x[~pos] + 1) + + return out + + def _box_cox_optimize(self, x): + """Find and return optimal lambda parameter of the Box-Cox transform by + MLE, for observed data x. + + We here use scipy builtins which uses the brent optimizer. + """ + # the computation of lambda is influenced by NaNs so we need to + # get rid of them + _, lmbda = stats.boxcox(x[~np.isnan(x)], lmbda=None) + + return lmbda + + def _yeo_johnson_optimize(self, x): + """Find and return optimal lambda parameter of the Yeo-Johnson + transform by MLE, for observed data x. + + Like for Box-Cox, MLE is done via the brent optimizer. 
+ """ + + def _neg_log_likelihood(lmbda): + """Return the negative log likelihood of the observed data x as a + function of lambda.""" + x_trans = self._yeo_johnson_transform(x, lmbda) + n_samples = x.shape[0] + + # Estimated mean and variance of the normal distribution + est_mean = x_trans.sum() / n_samples + est_var = np.power(x_trans - est_mean, 2).sum() / n_samples + + loglike = -n_samples / 2 * np.log(est_var) + loglike += (lmbda - 1) * (np.sign(x) * np.log(np.abs(x) + 1)).sum() + + return -loglike + + # the computation of lambda is influenced by NaNs so we need to + # get rid of them + x = x[~np.isnan(x)] + # choosing bracket -2, 2 like for boxcox + return optimize.brent(_neg_log_likelihood, brack=(-2, 2)) + def _check_input(self, X, check_positive=False, check_shape=False, check_method=False): """Validate the input before fit and transform. @@ -2569,7 +2723,8 @@ def _check_input(self, X, check_positive=False, check_shape=False, X : array-like, shape (n_samples, n_features) check_positive : bool - If True, check that all data is positive and non-zero. + If True, check that all data is positive and non-zero (only if + ``self.method=='box-cox'``). check_shape : bool If True, check that n_features matches the length of self.lambdas_ @@ -2593,7 +2748,7 @@ def _check_input(self, X, check_positive=False, check_shape=False, "than fitting data. Should have {n}, data has {m}" .format(n=len(self.lambdas_), m=X.shape[1])) - valid_methods = ('box-cox',) + valid_methods = ('box-cox', 'yeo-johnson') if check_method and self.method not in valid_methods: raise ValueError("'method' must be one of {}, " "got {} instead." diff --git a/sklearn/preprocessing/tests/test_common.py b/sklearn/preprocessing/tests/test_common.py index cbb77e4884040..ac904d99e8af3 100644 --- a/sklearn/preprocessing/tests/test_common.py +++ b/sklearn/preprocessing/tests/test_common.py @@ -41,7 +41,8 @@ def _get_valid_samples_by_column(X, col): (MinMaxScaler(), minmax_scale, False, False), (StandardScaler(), scale, False, False), (StandardScaler(with_mean=False), scale, True, False), - (PowerTransformer(), power_transform, False, True), + (PowerTransformer('yeo-johnson'), power_transform, False, False), + (PowerTransformer('box-cox'), power_transform, False, True), (QuantileTransformer(n_quantiles=10), quantile_transform, True, False), (RobustScaler(), robust_scale, False, False), (RobustScaler(with_centering=False), robust_scale, True, False)] diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 2ff9dfd776a03..f5ea7a9dd8edc 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -62,6 +62,7 @@ from sklearn.pipeline import Pipeline from sklearn.model_selection import cross_val_predict from sklearn.svm import SVR +from sklearn.utils import shuffle from sklearn import datasets @@ -2003,13 +2004,26 @@ def test_quantile_transform_valid_axis(): ". 
Got axis=2", quantile_transform, X.T, axis=2) -def test_power_transformer_notfitted(): - pt = PowerTransformer(method='box-cox') +@pytest.mark.parametrize("method", ['box-cox', 'yeo-johnson']) +def test_power_transformer_notfitted(method): + pt = PowerTransformer(method=method) X = np.abs(X_1col) assert_raises(NotFittedError, pt.transform, X) assert_raises(NotFittedError, pt.inverse_transform, X) +@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('X', [X_1col, X_2d]) +def test_power_transformer_inverse(method, standardize, X): + # Make sure we get the original input when applying transform and then + # inverse transform + X = np.abs(X) if method == 'box-cox' else X + pt = PowerTransformer(method=method, standardize=standardize) + X_trans = pt.fit_transform(X) + assert_almost_equal(X, pt.inverse_transform(X_trans)) + + def test_power_transformer_1d(): X = np.abs(X_1col) @@ -2061,11 +2075,12 @@ def test_power_transformer_2d(): assert isinstance(pt.lambdas_, np.ndarray) -def test_power_transformer_strictly_positive_exception(): +def test_power_transformer_boxcox_strictly_positive_exception(): + # Exceptions should be raised for negative arrays and zero arrays when + # method is boxcox + pt = PowerTransformer(method='box-cox') pt.fit(np.abs(X_2d)) - - # Exceptions should be raised for negative arrays and zero arrays X_with_negatives = X_2d not_positive_message = 'strictly positive' @@ -2076,7 +2091,7 @@ def test_power_transformer_strictly_positive_exception(): pt.fit, X_with_negatives) assert_raise_message(ValueError, not_positive_message, - power_transform, X_with_negatives) + power_transform, X_with_negatives, 'box-cox') assert_raise_message(ValueError, not_positive_message, pt.transform, np.zeros(X_2d.shape)) @@ -2085,11 +2100,19 @@ def test_power_transformer_strictly_positive_exception(): pt.fit, np.zeros(X_2d.shape)) assert_raise_message(ValueError, not_positive_message, - power_transform, np.zeros(X_2d.shape)) + power_transform, np.zeros(X_2d.shape), 'box-cox') -def test_power_transformer_shape_exception(): - pt = PowerTransformer(method='box-cox') +@pytest.mark.parametrize('X', [X_2d, np.abs(X_2d), -np.abs(X_2d), + np.zeros(X_2d.shape)]) +def test_power_transformer_yeojohnson_any_input(X): + # Yeo-Johnson method should support any kind of input + power_transform(X, method='yeo-johnson') + + +@pytest.mark.parametrize("method", ['box-cox', 'yeo-johnson']) +def test_power_transformer_shape_exception(method): + pt = PowerTransformer(method=method) X = np.abs(X_2d) pt.fit(X) @@ -2122,3 +2145,136 @@ def test_power_transformer_lambda_zero(): pt.lambdas_ = np.array([0]) X_trans = pt.transform(X) assert_array_almost_equal(pt.inverse_transform(X_trans), X) + + +def test_power_transformer_lambda_one(): + # Make sure lambda = 1 corresponds to the identity for yeo-johnson + pt = PowerTransformer(method='yeo-johnson', standardize=False) + X = np.abs(X_2d)[:, 0:1] + + pt.lambdas_ = np.array([1]) + X_trans = pt.transform(X) + assert_array_almost_equal(X_trans, X) + + +@pytest.mark.parametrize("method, lmbda", [('box-cox', .1), + ('box-cox', .5), + ('yeo-johnson', .1), + ('yeo-johnson', .5), + ('yeo-johnson', 1.), + ]) +def test_optimization_power_transformer(method, lmbda): + # Test the optimization procedure: + # - set a predefined value for lambda + # - apply inverse_transform to a normal dist (we get X_inv) + # - apply fit_transform to X_inv (we get X_inv_trans) + # - check that X_inv_trans is roughly 
equal to X + + rng = np.random.RandomState(0) + n_samples = 20000 + X = rng.normal(loc=0, scale=1, size=(n_samples, 1)) + + pt = PowerTransformer(method=method, standardize=False) + pt.lambdas_ = [lmbda] + X_inv = pt.inverse_transform(X) + + pt = PowerTransformer(method=method, standardize=False) + X_inv_trans = pt.fit_transform(X_inv) + + assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, + decimal=2) + assert_almost_equal(0, X_inv_trans.mean(), decimal=1) + assert_almost_equal(1, X_inv_trans.std(), decimal=1) + + +@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) +def test_power_transformer_nans(method): + # Make sure lambda estimation is not influenced by NaN values + # and that transform() supports NaN silently + + X = np.abs(X_1col) + pt = PowerTransformer(method=method) + pt.fit(X) + lmbda_no_nans = pt.lambdas_[0] + + # concat nans at the end and check lambda stays the same + X = np.concatenate([X, np.full_like(X, np.nan)]) + X = shuffle(X, random_state=0) + + pt.fit(X) + lmbda_nans = pt.lambdas_[0] + + assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5) + + X_trans = pt.transform(X) + assert_array_equal(np.isnan(X_trans), np.isnan(X)) + + +@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) +@pytest.mark.parametrize('standardize', [True, False]) +def test_power_transformer_fit_transform(method, standardize): + # check that fit_transform() and fit().transform() return the same values + X = X_1col + if method == 'box-cox': + X = np.abs(X) + + pt = PowerTransformer(method, standardize) + assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X)) + + +@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) +@pytest.mark.parametrize('standardize', [True, False]) +def test_power_transformer_copy_True(method, standardize): + # Check that neither fit, transform, fit_transform nor inverse_transform + # modify X inplace when copy=True + X = X_1col + if method == 'box-cox': + X = np.abs(X) + + X_original = X.copy() + assert X is not X_original # sanity checks + assert_array_almost_equal(X, X_original) + + pt = PowerTransformer(method, standardize, copy=True) + + pt.fit(X) + assert_array_almost_equal(X, X_original) + X_trans = pt.transform(X) + assert X_trans is not X + + X_trans = pt.fit_transform(X) + assert_array_almost_equal(X, X_original) + assert X_trans is not X + + X_inv_trans = pt.inverse_transform(X_trans) + assert X_trans is not X_inv_trans + + +@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) +@pytest.mark.parametrize('standardize', [True, False]) +def test_power_transformer_copy_False(method, standardize): + # check that when copy=False fit doesn't change X inplace but transform, + # fit_transform and inverse_transform do. 
+ X = X_1col + if method == 'box-cox': + X = np.abs(X) + + X_original = X.copy() + assert X is not X_original # sanity checks + assert_array_almost_equal(X, X_original) + + pt = PowerTransformer(method, standardize, copy=False) + + pt.fit(X) + assert_array_almost_equal(X, X_original) # fit didn't change X + + X_trans = pt.transform(X) + assert X_trans is X + + if method == 'box-cox': + X = np.abs(X) + X_trans = pt.fit_transform(X) + assert X_trans is X + + X_inv_trans = pt.inverse_transform(X_trans) + assert X_trans is X_inv_trans diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5149900c9c473..f15f8e5c86f91 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -896,9 +896,6 @@ def check_transformer_general(name, transformer, readonly_memmap=False): random_state=0, n_features=2, cluster_std=0.1) X = StandardScaler().fit_transform(X) X -= X.min() - if name == 'PowerTransformer': - # Box-Cox requires positive, non-zero data - X += 1 if readonly_memmap: X, y = create_memmap_backed_data([X, y]) @@ -1024,9 +1021,6 @@ def check_pipeline_consistency(name, estimator_orig): X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X -= X.min() - if name == 'PowerTransformer': - # Box-Cox requires positive, non-zero data - X += 1 X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -1088,9 +1082,6 @@ def check_estimators_dtypes(name, estimator_orig): methods = ["predict", "transform", "decision_function", "predict_proba"] for X_train in [X_train_32, X_train_64, X_train_int_64, X_train_int_32]: - if name == 'PowerTransformer': - # Box-Cox requires positive, non-zero data - X_train = np.abs(X_train) + 1 estimator = clone(estimator_orig) set_random_state(estimator, 1) estimator.fit(X_train, y) @@ -1205,9 +1196,6 @@ def check_estimators_pickle(name, estimator_orig): # some estimators can't do features less than 0 X -= X.min() - if name == 'PowerTransformer': - # Box-Cox requires positive, non-zero data - X += 1 X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) # include NaN values when the estimator should deal with them @@ -1575,9 +1563,6 @@ def check_estimators_fit_returns_self(name, estimator_orig, X, y = make_blobs(random_state=0, n_samples=9, n_features=4) # some want non-negative input X -= X.min() - if name == 'PowerTransformer': - # Box-Cox requires positive, non-zero data - X += 1 X = pairwise_estimator_convert_X(X, estimator_orig) estimator = clone(estimator_orig) @@ -1939,9 +1924,6 @@ def check_estimators_overwrite_params(name, estimator_orig): X, y = make_blobs(random_state=0, n_samples=9) # some want non-negative input X -= X.min() - if name == 'PowerTransformer': - # Box-Cox requires positive, non-zero data - X += 1 X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y)
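As a closing illustration of the behaviour the new tests cover (fit/fit_transform
consistency, invertibility, and NaN passthrough), here is a minimal sketch; it is
not part of the patch and assumes scikit-learn >= 0.20::

    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    rng = np.random.RandomState(0)
    X = rng.normal(loc=5, scale=2, size=(100, 2))
    X[::10, 0] = np.nan                  # NaNs are ignored when estimating lambda

    pt = PowerTransformer(method='yeo-johnson', standardize=True)
    X_trans = pt.fit_transform(X)

    # fit(X).transform(X) matches fit_transform(X)
    print(np.allclose(X_trans, pt.fit(X).transform(X), equal_nan=True))

    # the transformation is invertible, and NaNs stay NaN
    X_back = pt.inverse_transform(X_trans)
    print(np.allclose(X, X_back, equal_nan=True))
    print(np.array_equal(np.isnan(X), np.isnan(X_back)))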