diff --git a/doc/glossary.rst b/doc/glossary.rst
index 533a8eac63d04..37e96a7d394f7 100644
--- a/doc/glossary.rst
+++ b/doc/glossary.rst
@@ -294,7 +294,7 @@ General Concepts
         convergence of the training loss, to avoid over-fitting. This is
         generally done by monitoring the generalization score on a validation
         set. When available, it is activated through the parameter
-        ``early_stopping`` or by setting a postive :term:`n_iter_no_change`.
+        ``early_stopping`` or by setting a positive :term:`n_iter_no_change`.
 
     estimator instance
         We sometimes use this terminology to distinguish an :term:`estimator`
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 70f24def72f20..dd1f798ccb3aa 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -309,20 +309,34 @@ Power transforms are a family of parametric, monotonic transformations that aim
 to map data from any distribution to as close to a Gaussian distribution as
 possible in order to stabilize variance and minimize skewness.
 
-:class:`PowerTransformer` currently provides one such power transformation,
-the Box-Cox transform. The Box-Cox transform is given by:
+:class:`PowerTransformer` currently provides two such power transformations,
+the Yeo-Johnson transform and the Box-Cox transform.
+
+The Yeo-Johnson transform is given by:
 
 .. math::
-    y_i^{(\lambda)} =
+    x_i^{(\lambda)} =
     \begin{cases}
-    \dfrac{y_i^\lambda - 1}{\lambda} & \text{if } \lambda \neq 0, \\[8pt]
-    \ln{(y_i)} & \text{if } \lambda = 0,
+    [(x_i + 1)^\lambda - 1] / \lambda & \text{if } \lambda \neq 0, x_i \geq 0, \\[8pt]
+    \ln{(x_i + 1)} & \text{if } \lambda = 0, x_i \geq 0, \\[8pt]
+    -[(-x_i + 1)^{2 - \lambda} - 1] / (2 - \lambda) & \text{if } \lambda \neq 2, x_i < 0, \\[8pt]
+    - \ln (- x_i + 1) & \text{if } \lambda = 2, x_i < 0
     \end{cases}
 
-Box-Cox can only be applied to strictly positive data. The transformation is
-parameterized by :math:`\lambda`, which is determined through maximum likelihood
-estimation. Here is an example of using Box-Cox to map samples drawn from a
-lognormal distribution to a normal distribution::
+while the Box-Cox transform is given by:
+
+.. math::
+    x_i^{(\lambda)} =
+    \begin{cases}
+    \dfrac{x_i^\lambda - 1}{\lambda} & \text{if } \lambda \neq 0, \\[8pt]
+    \ln{(x_i)} & \text{if } \lambda = 0,
+    \end{cases}
+
+
+Box-Cox can only be applied to strictly positive data. In both methods, the
+transformation is parameterized by :math:`\lambda`, which is determined through
+maximum likelihood estimation. Here is an example of using Box-Cox to map
+samples drawn from a lognormal distribution to a normal distribution::
 
     >>> pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)
     >>> X_lognormal = np.random.RandomState(616).lognormal(size=(3, 3))
@@ -339,13 +353,14 @@ While the above example sets the `standardize` option to `False`,
 :class:`PowerTransformer` will apply zero-mean, unit-variance normalization
 to the transformed output by default.
 
-Below are examples of Box-Cox applied to various probability distributions.
-Note that when applied to certain distributions, Box-Cox achieves very
-Gaussian-like results, but with others, it is ineffective. This highlights
-the importance of visualizing the data before and after transformation.
+Below are examples of Box-Cox and Yeo-Johnson applied to various probability
+distributions. Note that when applied to certain distributions, the power
+transforms achieve very Gaussian-like results, but with others, they are
+ineffective.
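As a quick numerical check of the piecewise definition above, here is a minimal
NumPy sketch of the Yeo-Johnson mapping; the helper name ``yeo_johnson`` and the
example ``lmbda`` are illustrative only, not part of the patch::

    import numpy as np

    def yeo_johnson(x, lmbda):
        # Element-wise Yeo-Johnson mapping, following the four cases above.
        x = np.asarray(x, dtype=float)
        out = np.empty_like(x)
        pos = x >= 0
        if lmbda != 0:
            out[pos] = ((x[pos] + 1) ** lmbda - 1) / lmbda
        else:
            out[pos] = np.log1p(x[pos])
        if lmbda != 2:
            out[~pos] = -((1 - x[~pos]) ** (2 - lmbda) - 1) / (2 - lmbda)
        else:
            out[~pos] = -np.log1p(-x[~pos])
        return out

    print(yeo_johnson(np.array([-1.5, 0.0, 2.0]), lmbda=0.5))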
This highlights the importance of visualizing the data before and +after transformation. -.. figure:: ../auto_examples/preprocessing/images/sphx_glr_plot_power_transformer_001.png - :target: ../auto_examples/preprocessing/plot_power_transformer.html +.. figure:: ../auto_examples/preprocessing/images/sphx_glr_plot_map_data_to_normal_001.png + :target: ../auto_examples/preprocessing/plot_map_data_to_normal.html :align: center :scale: 100 diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 86c8d7a8ddab7..5b4a239368caf 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -136,12 +136,15 @@ Preprocessing DataFrames. :issue:`9012` by `Andreas Müller`_ and `Joris Van den Bossche`_, and :issue:`11315` by :user:`Thomas Fan `. -- Added :class:`preprocessing.PowerTransformer`, which implements the Box-Cox - power transformation, allowing users to map data from any distribution to a - Gaussian distribution. This is useful as a variance-stabilizing transformation - in situations where normality and homoscedasticity are desirable. +- Added :class:`preprocessing.PowerTransformer`, which implements the + Yeo-Johnson and Box-Cox power transformations. Power transformations try to + find a set of feature-wise parametric transformations to approximately map + data to a Gaussian distribution centered at zero and with unit variance. + This is useful as a variance-stabilizing transformation in situations where + normality and homoscedasticity are desirable. :issue:`10210` by :user:`Eric Chang ` and - :user:`Maniteja Nandana `. + :user:`Maniteja Nandana `, and :issue:`11520` by :user:`Nicolas + Hug `. - Added the :class:`compose.TransformedTargetRegressor` which transforms the target y before fitting a regression model. The predictions are mapped diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index 92cd635e2a06d..07fd3662da448 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -87,6 +87,8 @@ MaxAbsScaler().fit_transform(X)), ('Data after robust scaling', RobustScaler(quantile_range=(25, 75)).fit_transform(X)), + ('Data after power transformation (Yeo-Johnson)', + PowerTransformer(method='yeo-johnson').fit_transform(X)), ('Data after power transformation (Box-Cox)', PowerTransformer(method='box-cox').fit_transform(X)), ('Data after quantile transformation (gaussian pdf)', @@ -294,21 +296,21 @@ def make_plot(item_idx): make_plot(4) ############################################################################## -# PowerTransformer (Box-Cox) -# -------------------------- +# PowerTransformer +# ---------------- # -# ``PowerTransformer`` applies a power transformation to each -# feature to make the data more Gaussian-like. Currently, -# ``PowerTransformer`` implements the Box-Cox transform. The Box-Cox transform -# finds the optimal scaling factor to stabilize variance and mimimize skewness -# through maximum likelihood estimation. By default, ``PowerTransformer`` also -# applies zero-mean, unit variance normalization to the transformed output. -# Note that Box-Cox can only be applied to positive, non-zero data. Income and -# number of households happen to be strictly positive, but if negative values -# are present, a constant can be added to each feature to shift it into the -# positive range - this is known as the two-parameter Box-Cox transform. +# ``PowerTransformer`` applies a power transformation to each feature to make +# the data more Gaussian-like. 
Currently, ``PowerTransformer`` implements the
+# Yeo-Johnson and Box-Cox transforms. The power transform finds the optimal
+# scaling factor to stabilize variance and minimize skewness through maximum
+# likelihood estimation. By default, ``PowerTransformer`` also applies
+# zero-mean, unit variance normalization to the transformed output. Note that
+# Box-Cox can only be applied to strictly positive data. Income and number of
+# households happen to be strictly positive, but if negative values are
+# present, the Yeo-Johnson transform is to be preferred.
 
 make_plot(5)
+make_plot(6)
 
 ##############################################################################
 # QuantileTransformer (Gaussian output)
@@ -319,7 +321,7 @@ def make_plot(item_idx):
 # Note that this non-parametetric transformer introduces saturation artifacts
 # for extreme values.
 
-make_plot(6)
+make_plot(7)
 
 ###################################################################
 # QuantileTransformer (uniform output)
@@ -337,7 +339,7 @@ def make_plot(item_idx):
 # any outlier by setting them to the a priori defined range boundaries (0 and
 # 1).
 
-make_plot(7)
+make_plot(8)
 
 ##############################################################################
 # Normalizer
@@ -350,6 +352,6 @@ def make_plot(item_idx):
 # transformed data only lie in the positive quadrant. This would not be the
 # case if some original features had a mix of positive and negative values.
 
-make_plot(8)
+make_plot(9)
 
 plt.show()
diff --git a/examples/preprocessing/plot_map_data_to_normal.py b/examples/preprocessing/plot_map_data_to_normal.py
new file mode 100644
index 0000000000000..b8b7625f3c02b
--- /dev/null
+++ b/examples/preprocessing/plot_map_data_to_normal.py
@@ -0,0 +1,137 @@
+"""
+=================================
+Map data to a normal distribution
+=================================
+
+This example demonstrates the use of the Box-Cox and Yeo-Johnson transforms
+through :class:`preprocessing.PowerTransformer` to map data from various
+distributions to a normal distribution.
+
+The power transform is useful as a transformation in modeling problems where
+homoscedasticity and normality are desired. Below are examples of Box-Cox and
+Yeo-Johnson applied to six different probability distributions: Lognormal,
+Chi-squared, Weibull, Gaussian, Uniform, and Bimodal.
+
+Note that the transformations successfully map the data to a normal
+distribution when applied to certain datasets, but are ineffective with others.
+This highlights the importance of visualizing the data before and after
+transformation.
+
+Also note that even though Box-Cox seems to perform better than Yeo-Johnson for
+lognormal and chi-squared distributions, keep in mind that Box-Cox does not
+support inputs with negative values.
+
+For comparison, we also add the output from
+:class:`preprocessing.QuantileTransformer`. It can force any arbitrary
+distribution into a gaussian, provided that there are enough training samples
+(thousands). Because it is a non-parametric method, it is harder to interpret
+than the parametric ones (Box-Cox and Yeo-Johnson).
+
+On "small" datasets (less than a few hundred points), the quantile transformer
+is prone to overfitting. The use of the power transform is then recommended.
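To see the sample-size trade-off described above in code, here is a minimal
comparison; it is not part of the example file, and the sample size, seed and
skewness check are arbitrary illustration choices::

    import numpy as np
    from scipy import stats
    from sklearn.preprocessing import PowerTransformer, QuantileTransformer

    rng = np.random.RandomState(0)
    X = rng.lognormal(size=(3000, 1))           # plenty of training samples

    qt = QuantileTransformer(output_distribution='normal', random_state=0)
    pt = PowerTransformer(method='box-cox')

    print(stats.skew(X))                        # strongly skewed input
    print(stats.skew(qt.fit_transform(X)))      # near 0 given many samples
    print(stats.skew(pt.fit_transform(X)))      # parametric fit, also near 0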
+""" + +# Author: Eric Chang +# Nicolas Hug +# License: BSD 3 clause + +import numpy as np +import matplotlib.pyplot as plt + +from sklearn.preprocessing import PowerTransformer +from sklearn.preprocessing import QuantileTransformer +from sklearn.model_selection import train_test_split + +print(__doc__) + + +N_SAMPLES = 1000 +FONT_SIZE = 6 +BINS = 30 + + +rng = np.random.RandomState(304) +bc = PowerTransformer(method='box-cox') +yj = PowerTransformer(method='yeo-johnson') +qt = QuantileTransformer(output_distribution='normal', random_state=rng) +size = (N_SAMPLES, 1) + + +# lognormal distribution +X_lognormal = rng.lognormal(size=size) + +# chi-squared distribution +df = 3 +X_chisq = rng.chisquare(df=df, size=size) + +# weibull distribution +a = 50 +X_weibull = rng.weibull(a=a, size=size) + +# gaussian distribution +loc = 100 +X_gaussian = rng.normal(loc=loc, size=size) + +# uniform distribution +X_uniform = rng.uniform(low=0, high=1, size=size) + +# bimodal distribution +loc_a, loc_b = 100, 105 +X_a, X_b = rng.normal(loc=loc_a, size=size), rng.normal(loc=loc_b, size=size) +X_bimodal = np.concatenate([X_a, X_b], axis=0) + + +# create plots +distributions = [ + ('Lognormal', X_lognormal), + ('Chi-squared', X_chisq), + ('Weibull', X_weibull), + ('Gaussian', X_gaussian), + ('Uniform', X_uniform), + ('Bimodal', X_bimodal) +] + +colors = ['firebrick', 'darkorange', 'goldenrod', + 'seagreen', 'royalblue', 'darkorchid'] + +fig, axes = plt.subplots(nrows=8, ncols=3, figsize=plt.figaspect(2)) +axes = axes.flatten() +axes_idxs = [(0, 3, 6, 9), (1, 4, 7, 10), (2, 5, 8, 11), (12, 15, 18, 21), + (13, 16, 19, 22), (14, 17, 20, 23)] +axes_list = [(axes[i], axes[j], axes[k], axes[l]) + for (i, j, k, l) in axes_idxs] + + +for distribution, color, axes in zip(distributions, colors, axes_list): + name, X = distribution + X_train, X_test = train_test_split(X, test_size=.5) + + # perform power transforms and quantile transform + X_trans_bc = bc.fit(X_train).transform(X_test) + lmbda_bc = round(bc.lambdas_[0], 2) + X_trans_yj = yj.fit(X_train).transform(X_test) + lmbda_yj = round(yj.lambdas_[0], 2) + X_trans_qt = qt.fit(X_train).transform(X_test) + + ax_original, ax_bc, ax_yj, ax_qt = axes + + ax_original.hist(X_train, color=color, bins=BINS) + ax_original.set_title(name, fontsize=FONT_SIZE) + ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE) + + for ax, X_trans, meth_name, lmbda in zip( + (ax_bc, ax_yj, ax_qt), + (X_trans_bc, X_trans_yj, X_trans_qt), + ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'), + (lmbda_bc, lmbda_yj, None)): + ax.hist(X_trans, color=color, bins=BINS) + title = 'After {}'.format(meth_name) + if lmbda is not None: + title += '\n$\lambda$ = {}'.format(lmbda) + ax.set_title(title, fontsize=FONT_SIZE) + ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE) + ax.set_xlim([-3.5, 3.5]) + + +plt.tight_layout() +plt.show() diff --git a/examples/preprocessing/plot_power_transformer.py b/examples/preprocessing/plot_power_transformer.py deleted file mode 100644 index 52ce0d3121f73..0000000000000 --- a/examples/preprocessing/plot_power_transformer.py +++ /dev/null @@ -1,109 +0,0 @@ -""" -========================================================== -Using PowerTransformer to apply the Box-Cox transformation -========================================================== - -This example demonstrates the use of the Box-Cox transform through -:class:`preprocessing.PowerTransformer` to map data from various distributions -to a normal distribution. 
- -Box-Cox is useful as a transformation in modeling problems where -homoscedasticity and normality are desired. Below are examples of Box-Cox -applied to six different probability distributions: Lognormal, Chi-squared, -Weibull, Gaussian, Uniform, and Bimodal. - -Note that the transformation successfully maps the data to a normal -distribution when applied to certain datasets, but is ineffective with others. -This highlights the importance of visualizing the data before and after -transformation. Also note that while the standardize option is set to False for -the plot examples, by default, :class:`preprocessing.PowerTransformer` also -applies zero-mean, unit-variance standardization to the transformed outputs. -""" - -# Author: Eric Chang -# License: BSD 3 clause - -import numpy as np -import matplotlib.pyplot as plt - -from sklearn.preprocessing import PowerTransformer, minmax_scale - -print(__doc__) - - -N_SAMPLES = 3000 -FONT_SIZE = 6 -BINS = 100 - - -pt = PowerTransformer(method='box-cox', standardize=False) -rng = np.random.RandomState(304) -size = (N_SAMPLES, 1) - - -# lognormal distribution -X_lognormal = rng.lognormal(size=size) - -# chi-squared distribution -df = 3 -X_chisq = rng.chisquare(df=df, size=size) - -# weibull distribution -a = 50 -X_weibull = rng.weibull(a=a, size=size) - -# gaussian distribution -loc = 100 -X_gaussian = rng.normal(loc=loc, size=size) - -# uniform distribution -X_uniform = rng.uniform(low=0, high=1, size=size) - -# bimodal distribution -loc_a, loc_b = 100, 105 -X_a, X_b = rng.normal(loc=loc_a, size=size), rng.normal(loc=loc_b, size=size) -X_bimodal = np.concatenate([X_a, X_b], axis=0) - - -# create plots -distributions = [ - ('Lognormal', X_lognormal), - ('Chi-squared', X_chisq), - ('Weibull', X_weibull), - ('Gaussian', X_gaussian), - ('Uniform', X_uniform), - ('Bimodal', X_bimodal) -] - -colors = ['firebrick', 'darkorange', 'goldenrod', - 'seagreen', 'royalblue', 'darkorchid'] - -fig, axes = plt.subplots(nrows=4, ncols=3) -axes = axes.flatten() -axes_idxs = [(0, 3), (1, 4), (2, 5), (6, 9), (7, 10), (8, 11)] -axes_list = [(axes[i], axes[j]) for i, j in axes_idxs] - - -for distribution, color, axes in zip(distributions, colors, axes_list): - name, X = distribution - # scale all distributions to the range [0, 10] - X = minmax_scale(X, feature_range=(1e-10, 10)) - - # perform power transform - X_trans = pt.fit_transform(X) - lmbda = round(pt.lambdas_[0], 2) - - ax_original, ax_trans = axes - - ax_original.hist(X, color=color, bins=BINS) - ax_original.set_title(name, fontsize=FONT_SIZE) - ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE) - - ax_trans.hist(X_trans, color=color, bins=BINS) - ax_trans.set_title('{} after Box-Cox, $\lambda$ = {}'.format(name, lmbda), - fontsize=FONT_SIZE) - ax_trans.tick_params(axis='both', which='major', labelsize=FONT_SIZE) - - -plt.tight_layout() -plt.show() diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 2b17f41010eeb..1256b6522e928 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -17,6 +17,7 @@ import numpy as np from scipy import sparse from scipy import stats +from scipy import optimize from ..base import BaseEstimator, TransformerMixin from ..externals import six @@ -2400,10 +2401,12 @@ class PowerTransformer(BaseEstimator, TransformerMixin): modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. - Currently, PowerTransformer supports the Box-Cox transform. 
Box-Cox
-    requires input data to be strictly positive. The optimal parameter
-    for stabilizing variance and minimizing skewness is estimated through
-    maximum likelihood.
+    Currently, PowerTransformer supports the Box-Cox transform and the
+    Yeo-Johnson transform. The optimal parameter for stabilizing variance and
+    minimizing skewness is estimated through maximum likelihood.
+
+    Box-Cox requires input data to be strictly positive, while Yeo-Johnson
+    supports both positive and negative data.
 
     By default, zero-mean, unit-variance normalization is applied to the
     transformed data.
@@ -2412,9 +2415,11 @@ class PowerTransformer(BaseEstimator, TransformerMixin):
     Parameters
     ----------
-    method : str, (default='box-cox')
-        The power transform method. Currently, 'box-cox' (Box-Cox transform)
-        is the only option available.
+    method : str, (default='yeo-johnson')
+        The power transform method. Available methods are:
+
+        - 'yeo-johnson' [1]_, works with positive and negative values
+        - 'box-cox' [2]_, only works with strictly positive values
 
     standardize : boolean, default=True
         Set to True to apply zero-mean, unit-variance normalization to the
@@ -2435,13 +2440,13 @@ class PowerTransformer(BaseEstimator, TransformerMixin):
     >>> pt = PowerTransformer()
     >>> data = [[1, 2], [3, 2], [4, 5]]
     >>> print(pt.fit(data))
-    PowerTransformer(copy=True, method='box-cox', standardize=True)
-    >>> print(pt.lambdas_) # doctest: +ELLIPSIS
-    [ 1.051... -2.345...]
-    >>> print(pt.transform(data)) # doctest: +ELLIPSIS
-    [[-1.332... -0.707...]
-     [ 0.256... -0.707...]
-     [ 1.076...  1.414...]]
+    PowerTransformer(copy=True, method='yeo-johnson', standardize=True)
+    >>> print(pt.lambdas_)
+    [1.38668178e+00 5.93926346e-09]
+    >>> print(pt.transform(data))
+    [[-1.31616039 -0.70710678]
+     [ 0.20998268 -0.70710678]
+     [ 1.1061777   1.41421356]]
 
     See also
     --------
@@ -2461,21 +2466,24 @@ class PowerTransformer(BaseEstimator, TransformerMixin):
     References
     ----------
-    G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
-    Royal Statistical Society B, 26, 211-252 (1964).
+    .. [1] I.K. Yeo and R.A. Johnson, "A new family of power transformations to
+           improve normality or symmetry." Biometrika, 87(4), pp.954-959,
+           (2000).
+
+    .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal
+           of the Royal Statistical Society B, 26, 211-252 (1964).
     """
-    def __init__(self, method='box-cox', standardize=True, copy=True):
+    def __init__(self, method='yeo-johnson', standardize=True, copy=True):
         self.method = method
         self.standardize = standardize
         self.copy = copy
 
     def fit(self, X, y=None):
-        """Estimate the optimal parameter for each feature.
+        """Estimate the optimal parameter lambda for each feature.
 
-        The optimal parameter for minimizing skewness is estimated
-        on each feature independently. If the method is Box-Cox,
-        the lambdas are estimated using maximum likelihood.
+        The optimal lambda parameter for minimizing skewness is estimated on
+        each feature independently using maximum likelihood.
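To make the per-feature estimation concrete, here is a minimal sketch; it is
not part of the patch, assumes scikit-learn >= 0.20, and uses arbitrary toy
data::

    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    rng = np.random.RandomState(0)
    # two columns with very different shapes: skewed and roughly symmetric
    X = np.column_stack([rng.lognormal(size=1000), rng.normal(size=1000)])

    pt = PowerTransformer(method='yeo-johnson').fit(X)
    print(pt.lambdas_)   # one lambda per column, estimated independently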
Parameters ---------- @@ -2488,27 +2496,44 @@ def fit(self, X, y=None): ------- self : object """ + self._fit(X, y=y, force_transform=False) + return self + + def fit_transform(self, X, y=None): + return self._fit(X, y, force_transform=True) + + def _fit(self, X, y=None, force_transform=False): X = self._check_input(X, check_positive=True, check_method=True) - self.lambdas_ = [] - transformed = [] + if not self.copy and not force_transform: # if call from fit() + X = X.copy() # force copy so that fit does not change X inplace + optim_function = {'box-cox': self._box_cox_optimize, + 'yeo-johnson': self._yeo_johnson_optimize + }[self.method] + self.lambdas_ = [] for col in X.T: - # the computation of lambda is influenced by NaNs and we need to - # get rid of them to compute them. - _, lmbda = stats.boxcox(col[~np.isnan(col)], lmbda=None) - col_trans = boxcox(col, lmbda) - self.lambdas_.append(lmbda) - transformed.append(col_trans) - + with np.errstate(invalid='ignore'): # hide NaN warnings + lmbda = optim_function(col) + self.lambdas_.append(lmbda) self.lambdas_ = np.array(self.lambdas_) - transformed = np.array(transformed) + + if self.standardize or force_transform: + transform_function = {'box-cox': boxcox, + 'yeo-johnson': self._yeo_johnson_transform + }[self.method] + for i, lmbda in enumerate(self.lambdas_): + with np.errstate(invalid='ignore'): # hide NaN warnings + X[:, i] = transform_function(X[:, i], lmbda) if self.standardize: - self._scaler = StandardScaler() - self._scaler.fit(X=transformed.T) + self._scaler = StandardScaler(copy=False) + if force_transform: + X = self._scaler.fit_transform(X) + else: + self._scaler.fit(X) - return self + return X def transform(self, X): """Apply the power transform to each feature using the fitted lambdas. @@ -2517,12 +2542,21 @@ def transform(self, X): ---------- X : array-like, shape (n_samples, n_features) The data to be transformed using a power transformation. + + Returns + ------- + X_trans : array-like, shape (n_samples, n_features) + The transformed data. """ check_is_fitted(self, 'lambdas_') X = self._check_input(X, check_positive=True, check_shape=True) + transform_function = {'box-cox': boxcox, + 'yeo-johnson': self._yeo_johnson_transform + }[self.method] for i, lmbda in enumerate(self.lambdas_): - X[:, i] = boxcox(X[:, i], lmbda) + with np.errstate(invalid='ignore'): # hide NaN warnings + X[:, i] = transform_function(X[:, i], lmbda) if self.standardize: X = self._scaler.transform(X) @@ -2539,10 +2573,26 @@ def inverse_transform(self, X): else: X = (X_trans * lambda + 1) ** (1 / lambda) + The inverse of the Yeo-Johnson transformation is given by:: + + if X >= 0 and lambda == 0: + X = exp(X_trans) - 1 + elif X >= 0 and lambda != 0: + X = (X_trans * lambda + 1) ** (1 / lambda) - 1 + elif X < 0 and lambda != 2: + X = 1 - (-(2 - lambda) * X_trans + 1) ** (1 / (2 - lambda)) + elif X < 0 and lambda == 2: + X = 1 - exp(-X_trans) + Parameters ---------- X : array-like, shape (n_samples, n_features) The transformed data. 
+ + Returns + ------- + X : array-like, shape (n_samples, n_features) + The original data """ check_is_fitted(self, 'lambdas_') X = self._check_input(X, check_shape=True) @@ -2550,16 +2600,120 @@ def inverse_transform(self, X): if self.standardize: X = self._scaler.inverse_transform(X) + inv_fun = {'box-cox': self._box_cox_inverse_tranform, + 'yeo-johnson': self._yeo_johnson_inverse_transform + }[self.method] for i, lmbda in enumerate(self.lambdas_): - x = X[:, i] - if lmbda == 0: - x_inv = np.exp(x) - else: - x_inv = (x * lmbda + 1) ** (1 / lmbda) - X[:, i] = x_inv + with np.errstate(invalid='ignore'): # hide NaN warnings + X[:, i] = inv_fun(X[:, i], lmbda) return X + def _box_cox_inverse_tranform(self, x, lmbda): + """Return inverse-transformed input x following Box-Cox inverse + transform with parameter lambda. + """ + if lmbda == 0: + x_inv = np.exp(x) + else: + x_inv = (x * lmbda + 1) ** (1 / lmbda) + + return x_inv + + def _yeo_johnson_inverse_transform(self, x, lmbda): + """Return inverse-transformed input x following Yeo-Johnson inverse + transform with parameter lambda. + + Notes + ----- + We're comparing lmbda to 1e-19 instead of strict equality to 0. See + scipy/special/_boxcox.pxd for a rationale behind this + """ + x_inv = np.zeros(x.shape, dtype=x.dtype) + pos = x >= 0 + + # when x >= 0 + if lmbda < 1e-19: + x_inv[pos] = np.exp(x[pos]) - 1 + else: # lmbda != 0 + x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1 + + # when x < 0 + if lmbda < 2 - 1e-19: + x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, + 1 / (2 - lmbda)) + else: # lmbda == 2 + x_inv[~pos] = 1 - np.exp(-x[~pos]) + + return x_inv + + def _yeo_johnson_transform(self, x, lmbda): + """Return transformed input x following Yeo-Johnson transform with + parameter lambda. + + Notes + ----- + We're comparing lmbda to 1e-19 instead of strict equality to 0. See + scipy/special/_boxcox.pxd for a rationale behind this + """ + + out = np.zeros(shape=x.shape, dtype=x.dtype) + pos = x >= 0 # binary mask + + # when x >= 0 + if lmbda < 1e-19: + out[pos] = np.log(x[pos] + 1) + else: # lmbda != 0 + out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda + + # when x < 0 + if lmbda < 2 - 1e-19: + out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda) + else: # lmbda == 2 + out[~pos] = -np.log(-x[~pos] + 1) + + return out + + def _box_cox_optimize(self, x): + """Find and return optimal lambda parameter of the Box-Cox transform by + MLE, for observed data x. + + We here use scipy builtins which uses the brent optimizer. + """ + # the computation of lambda is influenced by NaNs so we need to + # get rid of them + _, lmbda = stats.boxcox(x[~np.isnan(x)], lmbda=None) + + return lmbda + + def _yeo_johnson_optimize(self, x): + """Find and return optimal lambda parameter of the Yeo-Johnson + transform by MLE, for observed data x. + + Like for Box-Cox, MLE is done via the brent optimizer. 
+ """ + + def _neg_log_likelihood(lmbda): + """Return the negative log likelihood of the observed data x as a + function of lambda.""" + x_trans = self._yeo_johnson_transform(x, lmbda) + n_samples = x.shape[0] + + # Estimated mean and variance of the normal distribution + est_mean = x_trans.sum() / n_samples + est_var = np.power(x_trans - est_mean, 2).sum() / n_samples + + loglike = -n_samples / 2 * np.log(est_var) + loglike += (lmbda - 1) * (np.sign(x) * np.log(np.abs(x) + 1)).sum() + + return -loglike + + # the computation of lambda is influenced by NaNs so we need to + # get rid of them + x = x[~np.isnan(x)] + # choosing bracket -2, 2 like for boxcox + return optimize.brent(_neg_log_likelihood, brack=(-2, 2)) + def _check_input(self, X, check_positive=False, check_shape=False, check_method=False): """Validate the input before fit and transform. @@ -2569,7 +2723,8 @@ def _check_input(self, X, check_positive=False, check_shape=False, X : array-like, shape (n_samples, n_features) check_positive : bool - If True, check that all data is positive and non-zero. + If True, check that all data is positive and non-zero (only if + ``self.method=='box-cox'``). check_shape : bool If True, check that n_features matches the length of self.lambdas_ @@ -2593,7 +2748,7 @@ def _check_input(self, X, check_positive=False, check_shape=False, "than fitting data. Should have {n}, data has {m}" .format(n=len(self.lambdas_), m=X.shape[1])) - valid_methods = ('box-cox',) + valid_methods = ('box-cox', 'yeo-johnson') if check_method and self.method not in valid_methods: raise ValueError("'method' must be one of {}, " "got {} instead." diff --git a/sklearn/preprocessing/tests/test_common.py b/sklearn/preprocessing/tests/test_common.py index cbb77e4884040..ac904d99e8af3 100644 --- a/sklearn/preprocessing/tests/test_common.py +++ b/sklearn/preprocessing/tests/test_common.py @@ -41,7 +41,8 @@ def _get_valid_samples_by_column(X, col): (MinMaxScaler(), minmax_scale, False, False), (StandardScaler(), scale, False, False), (StandardScaler(with_mean=False), scale, True, False), - (PowerTransformer(), power_transform, False, True), + (PowerTransformer('yeo-johnson'), power_transform, False, False), + (PowerTransformer('box-cox'), power_transform, False, True), (QuantileTransformer(n_quantiles=10), quantile_transform, True, False), (RobustScaler(), robust_scale, False, False), (RobustScaler(with_centering=False), robust_scale, True, False)] diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 2ff9dfd776a03..f5ea7a9dd8edc 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -62,6 +62,7 @@ from sklearn.pipeline import Pipeline from sklearn.model_selection import cross_val_predict from sklearn.svm import SVR +from sklearn.utils import shuffle from sklearn import datasets @@ -2003,13 +2004,26 @@ def test_quantile_transform_valid_axis(): ". 
Got axis=2", quantile_transform, X.T, axis=2) -def test_power_transformer_notfitted(): - pt = PowerTransformer(method='box-cox') +@pytest.mark.parametrize("method", ['box-cox', 'yeo-johnson']) +def test_power_transformer_notfitted(method): + pt = PowerTransformer(method=method) X = np.abs(X_1col) assert_raises(NotFittedError, pt.transform, X) assert_raises(NotFittedError, pt.inverse_transform, X) +@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('X', [X_1col, X_2d]) +def test_power_transformer_inverse(method, standardize, X): + # Make sure we get the original input when applying transform and then + # inverse transform + X = np.abs(X) if method == 'box-cox' else X + pt = PowerTransformer(method=method, standardize=standardize) + X_trans = pt.fit_transform(X) + assert_almost_equal(X, pt.inverse_transform(X_trans)) + + def test_power_transformer_1d(): X = np.abs(X_1col) @@ -2061,11 +2075,12 @@ def test_power_transformer_2d(): assert isinstance(pt.lambdas_, np.ndarray) -def test_power_transformer_strictly_positive_exception(): +def test_power_transformer_boxcox_strictly_positive_exception(): + # Exceptions should be raised for negative arrays and zero arrays when + # method is boxcox + pt = PowerTransformer(method='box-cox') pt.fit(np.abs(X_2d)) - - # Exceptions should be raised for negative arrays and zero arrays X_with_negatives = X_2d not_positive_message = 'strictly positive' @@ -2076,7 +2091,7 @@ def test_power_transformer_strictly_positive_exception(): pt.fit, X_with_negatives) assert_raise_message(ValueError, not_positive_message, - power_transform, X_with_negatives) + power_transform, X_with_negatives, 'box-cox') assert_raise_message(ValueError, not_positive_message, pt.transform, np.zeros(X_2d.shape)) @@ -2085,11 +2100,19 @@ def test_power_transformer_strictly_positive_exception(): pt.fit, np.zeros(X_2d.shape)) assert_raise_message(ValueError, not_positive_message, - power_transform, np.zeros(X_2d.shape)) + power_transform, np.zeros(X_2d.shape), 'box-cox') -def test_power_transformer_shape_exception(): - pt = PowerTransformer(method='box-cox') +@pytest.mark.parametrize('X', [X_2d, np.abs(X_2d), -np.abs(X_2d), + np.zeros(X_2d.shape)]) +def test_power_transformer_yeojohnson_any_input(X): + # Yeo-Johnson method should support any kind of input + power_transform(X, method='yeo-johnson') + + +@pytest.mark.parametrize("method", ['box-cox', 'yeo-johnson']) +def test_power_transformer_shape_exception(method): + pt = PowerTransformer(method=method) X = np.abs(X_2d) pt.fit(X) @@ -2122,3 +2145,136 @@ def test_power_transformer_lambda_zero(): pt.lambdas_ = np.array([0]) X_trans = pt.transform(X) assert_array_almost_equal(pt.inverse_transform(X_trans), X) + + +def test_power_transformer_lambda_one(): + # Make sure lambda = 1 corresponds to the identity for yeo-johnson + pt = PowerTransformer(method='yeo-johnson', standardize=False) + X = np.abs(X_2d)[:, 0:1] + + pt.lambdas_ = np.array([1]) + X_trans = pt.transform(X) + assert_array_almost_equal(X_trans, X) + + +@pytest.mark.parametrize("method, lmbda", [('box-cox', .1), + ('box-cox', .5), + ('yeo-johnson', .1), + ('yeo-johnson', .5), + ('yeo-johnson', 1.), + ]) +def test_optimization_power_transformer(method, lmbda): + # Test the optimization procedure: + # - set a predefined value for lambda + # - apply inverse_transform to a normal dist (we get X_inv) + # - apply fit_transform to X_inv (we get X_inv_trans) + # - check that X_inv_trans is roughly 
equal to X + + rng = np.random.RandomState(0) + n_samples = 20000 + X = rng.normal(loc=0, scale=1, size=(n_samples, 1)) + + pt = PowerTransformer(method=method, standardize=False) + pt.lambdas_ = [lmbda] + X_inv = pt.inverse_transform(X) + + pt = PowerTransformer(method=method, standardize=False) + X_inv_trans = pt.fit_transform(X_inv) + + assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, + decimal=2) + assert_almost_equal(0, X_inv_trans.mean(), decimal=1) + assert_almost_equal(1, X_inv_trans.std(), decimal=1) + + +@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) +def test_power_transformer_nans(method): + # Make sure lambda estimation is not influenced by NaN values + # and that transform() supports NaN silently + + X = np.abs(X_1col) + pt = PowerTransformer(method=method) + pt.fit(X) + lmbda_no_nans = pt.lambdas_[0] + + # concat nans at the end and check lambda stays the same + X = np.concatenate([X, np.full_like(X, np.nan)]) + X = shuffle(X, random_state=0) + + pt.fit(X) + lmbda_nans = pt.lambdas_[0] + + assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5) + + X_trans = pt.transform(X) + assert_array_equal(np.isnan(X_trans), np.isnan(X)) + + +@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) +@pytest.mark.parametrize('standardize', [True, False]) +def test_power_transformer_fit_transform(method, standardize): + # check that fit_transform() and fit().transform() return the same values + X = X_1col + if method == 'box-cox': + X = np.abs(X) + + pt = PowerTransformer(method, standardize) + assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X)) + + +@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) +@pytest.mark.parametrize('standardize', [True, False]) +def test_power_transformer_copy_True(method, standardize): + # Check that neither fit, transform, fit_transform nor inverse_transform + # modify X inplace when copy=True + X = X_1col + if method == 'box-cox': + X = np.abs(X) + + X_original = X.copy() + assert X is not X_original # sanity checks + assert_array_almost_equal(X, X_original) + + pt = PowerTransformer(method, standardize, copy=True) + + pt.fit(X) + assert_array_almost_equal(X, X_original) + X_trans = pt.transform(X) + assert X_trans is not X + + X_trans = pt.fit_transform(X) + assert_array_almost_equal(X, X_original) + assert X_trans is not X + + X_inv_trans = pt.inverse_transform(X_trans) + assert X_trans is not X_inv_trans + + +@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) +@pytest.mark.parametrize('standardize', [True, False]) +def test_power_transformer_copy_False(method, standardize): + # check that when copy=False fit doesn't change X inplace but transform, + # fit_transform and inverse_transform do. 
+ X = X_1col + if method == 'box-cox': + X = np.abs(X) + + X_original = X.copy() + assert X is not X_original # sanity checks + assert_array_almost_equal(X, X_original) + + pt = PowerTransformer(method, standardize, copy=False) + + pt.fit(X) + assert_array_almost_equal(X, X_original) # fit didn't change X + + X_trans = pt.transform(X) + assert X_trans is X + + if method == 'box-cox': + X = np.abs(X) + X_trans = pt.fit_transform(X) + assert X_trans is X + + X_inv_trans = pt.inverse_transform(X_trans) + assert X_trans is X_inv_trans diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5149900c9c473..f15f8e5c86f91 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -896,9 +896,6 @@ def check_transformer_general(name, transformer, readonly_memmap=False): random_state=0, n_features=2, cluster_std=0.1) X = StandardScaler().fit_transform(X) X -= X.min() - if name == 'PowerTransformer': - # Box-Cox requires positive, non-zero data - X += 1 if readonly_memmap: X, y = create_memmap_backed_data([X, y]) @@ -1024,9 +1021,6 @@ def check_pipeline_consistency(name, estimator_orig): X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X -= X.min() - if name == 'PowerTransformer': - # Box-Cox requires positive, non-zero data - X += 1 X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -1088,9 +1082,6 @@ def check_estimators_dtypes(name, estimator_orig): methods = ["predict", "transform", "decision_function", "predict_proba"] for X_train in [X_train_32, X_train_64, X_train_int_64, X_train_int_32]: - if name == 'PowerTransformer': - # Box-Cox requires positive, non-zero data - X_train = np.abs(X_train) + 1 estimator = clone(estimator_orig) set_random_state(estimator, 1) estimator.fit(X_train, y) @@ -1205,9 +1196,6 @@ def check_estimators_pickle(name, estimator_orig): # some estimators can't do features less than 0 X -= X.min() - if name == 'PowerTransformer': - # Box-Cox requires positive, non-zero data - X += 1 X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) # include NaN values when the estimator should deal with them @@ -1575,9 +1563,6 @@ def check_estimators_fit_returns_self(name, estimator_orig, X, y = make_blobs(random_state=0, n_samples=9, n_features=4) # some want non-negative input X -= X.min() - if name == 'PowerTransformer': - # Box-Cox requires positive, non-zero data - X += 1 X = pairwise_estimator_convert_X(X, estimator_orig) estimator = clone(estimator_orig) @@ -1939,9 +1924,6 @@ def check_estimators_overwrite_params(name, estimator_orig): X, y = make_blobs(random_state=0, n_samples=9) # some want non-negative input X -= X.min() - if name == 'PowerTransformer': - # Box-Cox requires positive, non-zero data - X += 1 X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y)
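As a closing illustration of the behaviour the new tests cover (fit/fit_transform
consistency, invertibility, and NaN passthrough), here is a minimal sketch; it is
not part of the patch and assumes scikit-learn >= 0.20::

    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    rng = np.random.RandomState(0)
    X = rng.normal(loc=5, scale=2, size=(100, 2))
    X[::10, 0] = np.nan                  # NaNs are ignored when estimating lambda

    pt = PowerTransformer(method='yeo-johnson', standardize=True)
    X_trans = pt.fit_transform(X)

    # fit(X).transform(X) matches fit_transform(X)
    print(np.allclose(X_trans, pt.fit(X).transform(X), equal_nan=True))

    # the transformation is invertible, and NaNs stay NaN
    X_back = pt.inverse_transform(X_trans)
    print(np.allclose(X, X_back, equal_nan=True))
    print(np.array_equal(np.isnan(X), np.isnan(X_back)))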