@@ -1,9 +1,12 @@
 import numpy as np
+from numpy.testing import assert_approx_equal
+
 from sklearn.utils.testing import (assert_equal, assert_array_almost_equal,
                                    assert_array_equal, assert_true,
                                    assert_raise_message)
 from sklearn.datasets import load_linnerud
 from sklearn.cross_decomposition import pls_, CCA
+from sklearn.preprocessing import StandardScaler


 def test_pls():
@@ -351,11 +354,38 @@ def test_scale_and_stability(): |
     assert_array_almost_equal(X_s_score, X_score)
     assert_array_almost_equal(Y_s_score, Y_score)

+
 def test_pls_errors():
     d = load_linnerud()
     X = d.data
     Y = d.target
     for clf in [pls_.PLSCanonical(), pls_.PLSRegression(),
                 pls_.PLSSVD()]:
         clf.n_components = 4
-        assert_raise_message(ValueError, "Invalid number of components", clf.fit, X, Y)
+        assert_raise_message(ValueError, "Invalid number of components",
+                             clf.fit, X, Y)
+
+
+def test_pls_scaling():
+    # sanity check for scale=True
+    n_samples = 1000
+    n_targets = 5
+    n_features = 10
+
+    rng = np.random.RandomState(0)
+
+    Q = rng.randn(n_targets, n_features)
+    Y = rng.randn(n_samples, n_targets)
+    X = np.dot(Y, Q) + 2 * rng.randn(n_samples, n_features) + 1
+    X *= 1000
+    X_scaled = StandardScaler().fit_transform(X)
+
+    pls = pls_.PLSRegression(n_components=5, scale=True)
+
+    pls.fit(X, Y)
+    score = pls.score(X, Y)
+
+    pls.fit(X_scaled, Y)
+    score_scaled = pls.score(X_scaled, Y)
+
+    assert_approx_equal(score, score_scaled)
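For reference, a standalone sketch of the behaviour the new `test_pls_scaling` test exercises: with `scale=True`, `PLSRegression` centers and scales its inputs internally, so the R² returned by `score` should be essentially the same whether the data are raw or pre-standardized. This sketch is illustrative only and not part of the diff; it assumes the public `sklearn.cross_decomposition.PLSRegression` and `sklearn.preprocessing.StandardScaler` imports rather than the private `pls_` module used inside the test file.

```python
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)

# Targets drive the features through a random linear map, plus noise and a
# large offset/scale so that raw and standardized X differ substantially.
Y = rng.randn(1000, 5)
X = (np.dot(Y, rng.randn(5, 10)) + 2 * rng.randn(1000, 10) + 1) * 1000
X_scaled = StandardScaler().fit_transform(X)

pls = PLSRegression(n_components=5, scale=True)

# With scale=True the estimator standardizes X and Y itself, so the R^2 from
# raw and pre-scaled data should agree to several significant digits.
score_raw = pls.fit(X, Y).score(X, Y)
score_scaled = pls.fit(X_scaled, Y).score(X_scaled, Y)
print(score_raw, score_scaled)
```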