Merged

Changes from all commits (56 commits)
68c82fe  ENH: Add boxcox transform to preprocess input data (maniteja123, May 14, 2016)
7f37876  cleanup + proper test / example (agramfort, May 26, 2016)
6ed54fa  Modify estimator checks to input positive data (maniteja123, Sep 15, 2016)
c0ad985  fix test again (maniteja123, Sep 15, 2016)
b6945ae  fix test (maniteja123, Sep 15, 2016)
b4afc75  flake8 fixes (maniteja123, Sep 15, 2016)
dfae43f  modify common tests (maniteja123, Sep 16, 2016)
ceeda6b  Modify example for boxcox transformation (maniteja123, Oct 14, 2016)
c4c23a1  update documentation (maniteja123, Oct 15, 2016)
71d5d80  Modify parallel to support memmap (maniteja123, Jan 19, 2017)
780baf7  Add inverse_transform (maniteja123, Feb 12, 2017)
df2b372  Add jnothman suggestions for vectorised inverse transform across feat… (maniteja123, Feb 18, 2017)
6097c13  Some minor refactoring and doc changes (maniteja123, Mar 1, 2017)
6235245  Merge branch 'INCOMPLETE-PR-6781' into implement-boxcox-transform (chang, Nov 21, 2017)
779eb7f  ENH: remove Parallel() usage (chang, Nov 22, 2017)
5d5e3c1  Fix merge conflicts (chang, Nov 22, 2017)
ad8515b  Merge branch 'sklearn-master' into implement-boxcox-transform (chang, Nov 22, 2017)
4b6d290  ENH: simplify boxcox .fit() and input validation (chang, Nov 23, 2017)
4b705c8  Remove function versions of boxcox, update references in docstring (chang, Nov 23, 2017)
376948f  Remove changes to _transform_selected() (chang, Nov 23, 2017)
f38831e  STY: use sklearn convention for fitted attributes: lmbdas_ (chang, Nov 27, 2017)
c98853c  TST rework tests for BoxCoxTransformer (chang, Nov 27, 2017)
99738f6  Merge branch 'sklearn-master' into implement-boxcox-transform (chang, Nov 27, 2017)
cfe8c8b  ENH: Implement Box-Cox inverse transformation (chang, Nov 27, 2017)
0a1e2ce  DOC: Update documentation for Box-Cox. Use lambdas_ instead of lmbdas… (chang, Nov 27, 2017)
89bfe52  DOC: Add Box-Cox example to examples/preprocessing/plot_all_scaling.py (chang, Nov 27, 2017)
93b683c  ENH: minor changes to docstrings and error messages (chang, Nov 27, 2017)
6f99646  TST: improve test coverage, fix linting error (chang, Nov 27, 2017)
7bf0c03  TST: Fix CI test errors by making data nonzero in check_estimators.py… (chang, Nov 27, 2017)
53f6129  ENH: change requests: update docs, improve test coverage (chang, Nov 27, 2017)
8681974  Merge branch 'sklearn-master' into implement-boxcox-transform (chang, Nov 28, 2017)
d71f117  DOC: Add BoxCox example to the User Guide. Remove mathematical explan… (chang, Nov 28, 2017)
b98e271  DOC: Undo overwrite of QuantileTransformer discussion (6f077a6) (chang, Nov 28, 2017)
5064384  TST: Fix doctest in BoxCoxTransformer user guide (chang, Nov 28, 2017)
63fa8ff  STY: Keep addition to user guide under 80 chars in line length (chang, Nov 28, 2017)
a83baa4  TST: test for the correct error messages (chang, Nov 29, 2017)
a5c5905  ENH: Pull input validation into _check_input(), and misc change requests (chang, Nov 29, 2017)
3ebe44c  RF: Refactor BoxCoxTransformer to PowerTransformer (chang, Nov 29, 2017)
3e72303  DOC: Update documentation to reflect BoxCoxTransformer -> PowerTransf… (chang, Nov 29, 2017)
2cdf73d  DOC: minor documentation changes (chang, Nov 29, 2017)
43d2142  DOC: update entry in whatsnew and credits (chang, Nov 29, 2017)
c45c8c7  STY: boxcox -> box-cox (chang, Nov 29, 2017)
c6538dd  DOC: minor docstring changes (chang, Nov 29, 2017)
8170816  TST: Break up PowerTransformer test (chang, Nov 29, 2017)
11547b9  DOC: what's new? (chang, Nov 29, 2017)
d516bd9  ENH: minor change to what's new and error message in PowerTransformer (chang, Nov 30, 2017)
fb0f425  DOC: Add a usage example for PowerTransformer() (chang, Nov 30, 2017)
2817f5d  Merge branch 'sklearn-master' into implement-boxcox-transform (chang, Nov 30, 2017)
ddba1bb  DOC: Clean up documentation and usage examples (chang, Nov 30, 2017)
2b0f9dc  STY: minor style change requests (chang, Dec 1, 2017)
1b9bfd7  ENH: Add power_transform, the function version of PowerTransformer (chang, Dec 1, 2017)
261ad40  ENH: Add gaussian, uniform, and bimodal distributions to example (chang, Dec 1, 2017)
a429b99  DOC: Tweak new distribution examples and update docs (chang, Dec 1, 2017)
234d69d  ENH: improvements to PowerTransformer docs and normalize distribution… (chang, Dec 5, 2017)
6653100  DOC: documentation tweaks and remove global param setting from plot e… (chang, Dec 5, 2017)
c303ae6  DOC: minor tweak to plot_all_scaling.py (chang, Dec 5, 2017)
2 changes: 2 additions & 0 deletions doc/modules/classes.rst
@@ -1200,6 +1200,7 @@ Model validation
preprocessing.OneHotEncoder
preprocessing.CategoricalEncoder
preprocessing.PolynomialFeatures
preprocessing.PowerTransformer
preprocessing.QuantileTransformer
preprocessing.RobustScaler
preprocessing.StandardScaler
@@ -1217,6 +1218,7 @@ Model validation
preprocessing.quantile_transform
preprocessing.robust_scale
preprocessing.scale
preprocessing.power_transform


.. _random_projection_ref:
52 changes: 50 additions & 2 deletions doc/modules/preprocessing.rst
@@ -261,6 +261,9 @@ defined by :math:`\phi` followed by removal of the mean in that space.
Non-linear transformation
=========================

Mapping to a Uniform distribution
---------------------------------

Like scalers, :class:`QuantileTransformer` puts all features into the same,
known range or distribution. However, by performing a rank transformation, it
smooths out unusual distributions and is less influenced by outliers than
@@ -299,8 +302,53 @@ This can be confirmed on an independent testing set with similar remarks::
... # doctest: +ELLIPSIS +SKIP
array([ 0.01..., 0.25..., 0.46..., 0.60... , 0.94...])

It is also possible to map the transformed data to a normal distribution by
setting ``output_distribution='normal'``::
Mapping to a Gaussian distribution
----------------------------------

In many modeling scenarios, normality of the features in a dataset is desirable.
Power transforms are a family of parametric, monotonic transformations that aim
to map data from any distribution to as close to a Gaussian distribution as
possible in order to stabilize variance and minimize skewness.

:class:`PowerTransformer` currently provides one such power transformation,
the Box-Cox transform. The Box-Cox transform is given by:

.. math::

    y_i^{(\lambda)} =
    \begin{cases}
    \dfrac{y_i^\lambda - 1}{\lambda} & \text{if } \lambda \neq 0, \\[8pt]
    \ln{(y_i)} & \text{if } \lambda = 0,
    \end{cases}

Box-Cox can only be applied to strictly positive data. The transformation is
parameterized by :math:`\lambda`, which is determined through maximum
likelihood estimation. Note that the :math:`\lambda = 0` branch is simply the
limit of :math:`(y_i^\lambda - 1)/\lambda` as :math:`\lambda \to 0`, so the
transform is continuous in :math:`\lambda`. Here is an example of using
Box-Cox to map samples drawn from a lognormal distribution to a normal
distribution::

>>> pt = preprocessing.PowerTransformer(method='box-cox')
>>> X_lognormal = np.random.RandomState(616).lognormal(size=(3, 3))
>>> X_lognormal # doctest: +ELLIPSIS
array([[ 1.28..., 1.18..., 0.84...],
[ 0.94..., 1.60..., 0.38...],
[ 1.35..., 0.21..., 1.09...]])
>>> pt.fit_transform(X_lognormal) # doctest: +ELLIPSIS
array([[ 0.49..., 0.17..., -0.15...],
[-0.05..., 0.58..., -0.57...],
[ 0.69..., -0.84..., 0.10...]])
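
As a quick cross-check (a sketch, not part of the shipped example; it assumes
SciPy is installed and that the transformer applies no additional
standardization), each column of the fitted transform should agree with
``scipy.stats.boxcox``, which also estimates :math:`\lambda` by maximum
likelihood, and ``inverse_transform`` should recover the original data::

    import numpy as np
    from scipy import stats
    from sklearn.preprocessing import PowerTransformer

    X = np.random.RandomState(616).lognormal(size=(100, 3))
    pt = PowerTransformer(method='box-cox').fit(X)
    X_trans = pt.transform(X)

    for j in range(X.shape[1]):
        x_j, lmbda_j = stats.boxcox(X[:, j])  # SciPy's MLE fit per column
        assert np.allclose(lmbda_j, pt.lambdas_[j])
        assert np.allclose(x_j, X_trans[:, j])

    assert np.allclose(pt.inverse_transform(X_trans), X)  # round trip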

Below are examples of Box-Cox applied to various probability distributions.
Note that when applied to certain distributions, Box-Cox achieves very
Gaussian-like results, but with others, it is ineffective. This highlights
the importance of visualizing the data before and after transformation.

.. figure:: ../auto_examples/preprocessing/images/sphx_glr_plot_power_transformer_001.png
    :target: ../auto_examples/preprocessing/plot_power_transformer.html
    :align: center
    :scale: 100

It is also possible to map data to a normal distribution using
:class:`QuantileTransformer` by setting ``output_distribution='normal'``.
Using the earlier example with the iris dataset::

>>> quantile_transformer = preprocessing.QuantileTransformer(
... output_distribution='normal', random_state=0)
12 changes: 9 additions & 3 deletions doc/whats_new/v0.20.rst
@@ -60,6 +60,14 @@ Preprocessing
the maximum value in the features. :issue:`9151` by
:user:`Vighnesh Birodkar <vighneshbirodkar>` and `Joris Van den Bossche`_.

- Added :class:`preprocessing.PowerTransformer`, which implements the Box-Cox
power transformation to map data from various distributions toward a Gaussian
distribution. This is useful as a variance-stabilizing transformation
in situations where normality and homoscedasticity are desirable.
:issue:`10210` by :user:`Eric Chang <ericchang00>` and
:user:`Maniteja Nandana <maniteja123>`.


Model evaluation

- Added the :func:`metrics.balanced_accuracy_score` metric and a corresponding
@@ -211,16 +219,14 @@ Feature Extraction
throw an exception if ``max_patches`` was greater than or equal to the number
of all possible patches rather than simply returning the number of possible
patches. :issue:`10100` by :user:`Varun Agrawal <varunagrawal>`

- Fixed a bug in :class:`feature_extraction.text.CountVectorizer`,
:class:`feature_extraction.text.TfidfVectorizer`,
:class:`feature_extraction.text.HashingVectorizer` to support 64 bit sparse
array indexing necessary to process large datasets with more than 2·10⁹ tokens
(words or n-grams). :issue:`9147` by :user:`Claes-Fredrik Mannby <mannby>`
and `Roman Yurchak`_.



API changes summary
-------------------

64 changes: 44 additions & 20 deletions examples/preprocessing/plot_all_scaling.py
@@ -29,8 +29,10 @@
other in the way to estimate the parameters used to shift and scale each
feature.

``QuantileTransformer`` provides non-linear transformations in which distances
between marginal outliers and inliers are shrunk. ``PowerTransformer`` provides
non-linear transformations in which data is mapped to a normal distribution to
stabilize variance and minimize skewness.

Unlike the previous transformations, normalization refers to a per sample
transformation instead of a per feature transformation.
@@ -59,7 +61,8 @@
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

from sklearn.datasets import fetch_california_housing

@@ -84,14 +87,16 @@
MaxAbsScaler().fit_transform(X)),
('Data after robust scaling',
RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
('Data after power transformation (Box-Cox)',
PowerTransformer(method='box-cox').fit_transform(X)),
('Data after quantile transformation (gaussian pdf)',
QuantileTransformer(output_distribution='normal')
.fit_transform(X)),
('Data after quantile transformation (uniform pdf)',
QuantileTransformer(output_distribution='uniform')
.fit_transform(X)),
('Data after sample-wise L2 normalizing',
Normalizer().fit_transform(X)),
]

# scale the output between 0 and 1 for the colorbar
@@ -286,6 +291,35 @@ def make_plot(item_idx):

make_plot(4)

##############################################################################
# PowerTransformer (Box-Cox)
# --------------------------
#
# ``PowerTransformer`` applies a power transformation to each
# feature to make the data more Gaussian-like. Currently,
# ``PowerTransformer`` implements the Box-Cox transform. It differs from
# ``QuantileTransformer`` (Gaussian output) in that it does not map the
# data to a zero-mean, unit-variance Gaussian distribution. Instead, Box-Cox
# finds the optimal scaling factor to stabilize variance and minimize skewness
# through maximum likelihood estimation. Note that Box-Cox can only be applied
# to strictly positive data. Income and number of households happen to be
# strictly positive, but if negative values are present, a constant can be
# added to each feature to shift it into the positive range; this is known as
# the two-parameter Box-Cox transform, sketched below.

make_plot(5)

##############################################################################
# QuantileTransformer (Gaussian output)
# -------------------------------------
#
# ``QuantileTransformer`` has an additional ``output_distribution`` parameter
# that allows matching a Gaussian distribution instead of a uniform one.
# Note that this non-parametric transformer introduces saturation artifacts
# for extreme values.

make_plot(6)

###################################################################
# QuantileTransformer (uniform output)
# ------------------------------------
@@ -302,18 +336,7 @@ def make_plot(item_idx):
# any outlier by setting them to the a priori defined range boundaries (0 and
# 1).

make_plot(7)

##############################################################################
# Normalizer
Expand All @@ -326,5 +349,6 @@ def make_plot(item_idx):
# transformed data only lie in the positive quadrant. This would not be the
# case if some original features had a mix of positive and negative values.

make_plot(8)

plt.show()
107 changes: 107 additions & 0 deletions examples/preprocessing/plot_power_transformer.py
@@ -0,0 +1,107 @@
"""
==========================================================
Using PowerTransformer to apply the Box-Cox transformation
==========================================================

This example demonstrates the use of the Box-Cox transform through
:class:`preprocessing.PowerTransformer` to map data from various distributions
to a normal distribution.

Box-Cox is useful as a transformation in modeling problems where
homoscedasticity and normality are desired. Below are examples of Box-Cox
applied to six different probability distributions: Lognormal, Chi-squared,
Weibull, Gaussian, Uniform, and Bimodal.

Note that the transformation successfully maps the data to a normal
distribution when applied to certain datasets, but is ineffective with others.
This highlights the importance of visualizing the data before and after
transformation.
"""

# Author: Eric Chang <[email protected]>
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import PowerTransformer, minmax_scale

print(__doc__)


N_SAMPLES = 3000
FONT_SIZE = 6
BINS = 100


pt = PowerTransformer(method='box-cox')
rng = np.random.RandomState(304)
size = (N_SAMPLES, 1)


# lognormal distribution
X_lognormal = rng.lognormal(size=size)

# chi-squared distribution
df = 3
X_chisq = rng.chisquare(df=df, size=size)

# weibull distribution
a = 50
X_weibull = rng.weibull(a=a, size=size)

# gaussian distribution
loc = 100
X_gaussian = rng.normal(loc=loc, size=size)

# uniform distribution
X_uniform = rng.uniform(low=0, high=1, size=size)

# bimodal distribution
loc_a, loc_b = 100, 105
X_a, X_b = rng.normal(loc=loc_a, size=size), rng.normal(loc=loc_b, size=size)
X_bimodal = np.concatenate([X_a, X_b], axis=0)


# create plots
distributions = [
('Lognormal', X_lognormal),
('Chi-squared', X_chisq),
('Weibull', X_weibull),
('Gaussian', X_gaussian),
('Uniform', X_uniform),
('Bimodal', X_bimodal)
]

colors = ['firebrick', 'darkorange', 'goldenrod',
'seagreen', 'royalblue', 'darkorchid']

fig, axes = plt.subplots(nrows=4, ncols=3)
axes = axes.flatten()
axes_idxs = [(0, 3), (1, 4), (2, 5), (6, 9), (7, 10), (8, 11)]
axes_list = [(axes[i], axes[j]) for i, j in axes_idxs]


for distribution, color, axes in zip(distributions, colors, axes_list):
name, X = distribution
# scale all distributions to a strictly positive range, (0, 10],
# as required by Box-Cox
X = minmax_scale(X, feature_range=(1e-10, 10))

# perform power transform
X_trans = pt.fit_transform(X)
lmbda = round(pt.lambdas_[0], 2)

ax_original, ax_trans = axes

ax_original.hist(X, color=color, bins=BINS)
ax_original.set_title(name, fontsize=FONT_SIZE)
ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE)

ax_trans.hist(X_trans, color=color, bins=BINS)
ax_trans.set_title(r'{} after Box-Cox, $\lambda$ = {}'.format(name, lmbda),
fontsize=FONT_SIZE)
ax_trans.tick_params(axis='both', which='major', labelsize=FONT_SIZE)


plt.tight_layout()
plt.show()
4 changes: 4 additions & 0 deletions sklearn/preprocessing/__init__.py
@@ -21,7 +21,9 @@
from .data import maxabs_scale
from .data import minmax_scale
from .data import quantile_transform
from .data import power_transform
from .data import OneHotEncoder
from .data import PowerTransformer
from .data import CategoricalEncoder

from .data import PolynomialFeatures
@@ -48,6 +50,7 @@
'Normalizer',
'OneHotEncoder',
'CategoricalEncoder',
'PowerTransformer',
'RobustScaler',
'StandardScaler',
'add_dummy_feature',
@@ -60,4 +63,5 @@
'minmax_scale',
'label_binarize',
'quantile_transform',
'power_transform',
]
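
The PR also exposes the functional form ``power_transform`` alongside the
``PowerTransformer`` estimator. A minimal sketch of the intended equivalence
(assuming the function mirrors the estimator's Box-Cox defaults; the random
data below is illustrative only):

    import numpy as np
    from sklearn.preprocessing import PowerTransformer, power_transform

    rng = np.random.RandomState(0)
    X = rng.lognormal(size=(50, 2))  # strictly positive input
    assert np.allclose(power_transform(X, method='box-cox'),
                       PowerTransformer(method='box-cox').fit_transform(X))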