From e39777bf14814ed699c02417845d01dd3ea40d10 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Tue, 26 Feb 2019 15:04:00 +0100 Subject: [PATCH 1/3] FIX : make LinearRegression perfectly consistent across sparse or dense --- sklearn/linear_model/base.py | 17 ++++++++++++++--- sklearn/linear_model/tests/test_base.py | 20 ++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index eb474f8386189..61a443709e1ee 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -459,21 +459,32 @@ def fit(self, X, y, sample_weight=None): X, y, X_offset, y_offset, X_scale = self._preprocess_data( X, y, fit_intercept=self.fit_intercept, normalize=self.normalize, - copy=self.copy_X, sample_weight=sample_weight) + copy=self.copy_X, sample_weight=sample_weight, + return_mean=True) if sample_weight is not None: # Sample weight can be implemented via a simple rescaling. X, y = _rescale_data(X, y, sample_weight) if sp.issparse(X): + X_offset_scale = X_offset / X_scale + def matvec(b): + return X.dot(b) - b.dot(X_offset_scale) + def rmatvec(b): + return X.T.dot(b) - (X_offset_scale) * np.sum(b) + + X_centered = sparse.linalg.LinearOperator(shape=X.shape, + matvec=matvec, + rmatvec=rmatvec) + if y.ndim < 2: - out = sparse_lsqr(X, y) + out = sparse_lsqr(X_centered, y) self.coef_ = out[0] self._residues = out[3] else: # sparse_lstsq cannot handle y with shape (M, K) outs = Parallel(n_jobs=n_jobs_)( - delayed(sparse_lsqr)(X, y[:, j].ravel()) + delayed(sparse_lsqr)(X_centered, y[:, j].ravel()) for j in range(y.shape[1])) self.coef_ = np.vstack([out[0] for out in outs]) self._residues = np.vstack([out[3] for out in outs]) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index bcabe12ed35f3..1bfe9b6934205 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -150,6 +150,26 @@ def 
test_linear_regression_sparse(random_state=0): assert_array_almost_equal(ols.predict(X) - y.ravel(), 0) +@pytest.mark.parametrize('normalize', [True, False]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +def test_linear_regression_sparse_equal_dense(normalize, fit_intercept): + # Test that linear regression agrees between sparse and dense + rng = check_random_state(0) + n_samples = 200 + n_features = 2 + X = rng.randn(n_samples, n_features) + X[X < 0.1] = 0. + Xcsr = sparse.csr_matrix(X) + y = rng.rand(n_samples) + params = dict(normalize=normalize, fit_intercept=fit_intercept) + clf_dense = LinearRegression(**params) + clf_sparse = LinearRegression(**params) + clf_dense.fit(X, y) + clf_sparse.fit(Xcsr, y) + assert_almost_equal(clf_dense.intercept_, clf_sparse.intercept_) + assert_array_almost_equal(clf_dense.coef_, clf_sparse.coef_) + + def test_linear_regression_multiple_outcome(random_state=0): # Test multiple-outcome linear regressions X, y = make_regression(random_state=random_state) From d26be2e07bcb2f3b62ea9a84dae2ed0f80b4df8a Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Tue, 26 Feb 2019 15:44:46 +0100 Subject: [PATCH 2/3] comments --- doc/whats_new/v0.21.rst | 4 ++++ sklearn/linear_model/tests/test_base.py | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 0ad582017840f..9e4b2c4443eef 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -174,6 +174,10 @@ Support for Python 3.4 and below has been officially dropped. parameter value ``copy_X=True`` in ``fit``. :issue:`12972` by :user:`Lucio Fernandez-Arjona ` +- |Fix| Fixed a bug in :class:`linear_model.LinearRegression` that + was not returning the same coeffecient and intercepts with + ``fit_intercept=True``. :issue:`13279` by `Alexandre Gramfort`_ + :mod:`sklearn.manifold` ............................ 
diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 1bfe9b6934205..6fca044c25d86 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -13,6 +13,7 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_allclose from sklearn.linear_model.base import LinearRegression from sklearn.linear_model.base import _preprocess_data @@ -166,8 +167,8 @@ def test_linear_regression_sparse_equal_dense(normalize, fit_intercept): clf_sparse = LinearRegression(**params) clf_dense.fit(X, y) clf_sparse.fit(Xcsr, y) - assert_almost_equal(clf_dense.intercept_, clf_sparse.intercept_) - assert_array_almost_equal(clf_dense.coef_, clf_sparse.coef_) + assert clf_dense.intercept_ == pytest.approx(clf_sparse.intercept_) + assert_allclose(clf_dense.coef_, clf_sparse.coef_) def test_linear_regression_multiple_outcome(random_state=0): From 50a610350f2052d2693faf1ff9b113fd46b8c6f9 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Tue, 26 Feb 2019 21:26:40 +0100 Subject: [PATCH 3/3] review --- doc/whats_new/v0.21.rst | 5 +++-- sklearn/linear_model/base.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 9e4b2c4443eef..2494d0dbbcee2 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -175,8 +175,9 @@ Support for Python 3.4 and below has been officially dropped. :issue:`12972` by :user:`Lucio Fernandez-Arjona ` - |Fix| Fixed a bug in :class:`linear_model.LinearRegression` that - was not returning the same coeffecient and intercepts with - ``fit_intercept=True``. :issue:`13279` by `Alexandre Gramfort`_ + was not returning the same coefficients and intercepts with + ``fit_intercept=True`` in the sparse and dense cases. 
+ :issue:`13279` by `Alexandre Gramfort`_ :mod:`sklearn.manifold` ............................ diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index 61a443709e1ee..e194afa94488b 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -468,10 +468,12 @@ def fit(self, X, y, sample_weight=None): if sp.issparse(X): X_offset_scale = X_offset / X_scale + def matvec(b): return X.dot(b) - b.dot(X_offset_scale) + def rmatvec(b): - return X.T.dot(b) - (X_offset_scale) * np.sum(b) + return X.T.dot(b) - X_offset_scale * np.sum(b) X_centered = sparse.linalg.LinearOperator(shape=X.shape, matvec=matvec,