[MRG] FIX and ENH in _RidgeGCV #15648

Status: Closed (wants to merge 23 commits).

Changes from all commits:

86 changes: 58 additions & 28 deletions sklearn/linear_model/_ridge.py
@@ -1054,6 +1054,16 @@ def _matmat(self, v):
return res


class _IdentityEstimator:
"""Hack to call a scorer when we already have the predictions."""

def decision_function(self, y_predict):
return y_predict

def predict(self, y_predict):
return y_predict


class _RidgeGCV(LinearModel):
"""Ridge regression with built-in Generalized Cross-Validation

@@ -1087,6 +1097,10 @@ class _RidgeGCV(LinearModel):

looe = y - loov = c / diag(G^-1)

The best score (negative mean squared error or user-provided scoring) is
stored in the `best_score_` attribute, and the selected hyperparameter in
`alpha_`.

References
----------
http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf
@@ -1460,45 +1474,59 @@ def fit(self, X, y, sample_weight=None):
X, y = _rescale_data(X, y, sample_weight)
sqrt_sw = np.sqrt(sample_weight)
else:
sqrt_sw = np.ones(X.shape[0], dtype=X.dtype)
sample_weight = sqrt_sw = np.ones(X.shape[0], dtype=X.dtype)

X_mean, *decomposition = decompose(X, y, sqrt_sw)

scorer = check_scoring(self, scoring=self.scoring, allow_none=True)
error = scorer is None

n_y = 1 if len(y.shape) == 1 else y.shape[1]
cv_values = np.zeros((n_samples * n_y, len(self.alphas)),
dtype=X.dtype)
C = []
X_mean, *decomposition = decompose(X, y, sqrt_sw)

if self.store_cv_values:
self.cv_values_ = np.empty(
(n_samples * n_y, len(self.alphas)), dtype=X.dtype)

best_coef, best_score, best_alpha = None, None, None

for i, alpha in enumerate(self.alphas):
G_inverse_diag, c = solve(
float(alpha), y, sqrt_sw, X_mean, *decomposition)
if error:
squared_errors = (c / G_inverse_diag) ** 2
cv_values[:, i] = squared_errors.ravel()
# convert errors back to the original space
# return converted errors
# calculate scores based on converted errors
if y.ndim == 2:
squared_errors /= sample_weight[:, np.newaxis]
else:
squared_errors /= sample_weight

Review comment (Member): It is not that easy: there is a test, ridge_sample_weight, which will fail. Until now it was assumed that repeating a sample 3 times leads to an error 3 times bigger; normalizing the sample_weight will not reproduce that result.
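
For background, the weight-versus-repetition equivalence at stake here can be sketched as follows (a hypothetical check with an assumed fixed alpha, not the actual ridge_sample_weight test): giving a sample an integer weight of 3 should match physically repeating it 3 times.

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

X, y = make_regression(n_samples=8, n_features=3, random_state=0)

# give the first sample a weight of 3
sample_weight = np.ones(len(X))
sample_weight[0] = 3.0

# equivalent dataset with the first sample appearing 3 times in total
indices = np.concatenate([[0, 0], np.arange(len(X))])
X_tiled, y_tiled = X[indices], y[indices]

coef_weighted = Ridge(alpha=1.0).fit(X, y, sample_weight=sample_weight).coef_
coef_tiled = Ridge(alpha=1.0).fit(X_tiled, y_tiled).coef_
np.testing.assert_allclose(coef_weighted, coef_tiled, rtol=1e-8)
```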

Review comment (Member Author): This part makes sure that RidgeCV() is equivalent to GridSearchCV(Ridge(), cv=LeaveOneOut()).
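
As a minimal sketch of that equivalence (assuming a small random problem; the tolerance is illustrative):

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import GridSearchCV, LeaveOneOut

X, y = make_regression(n_samples=10, n_features=5, random_state=0)
alphas = [0.1, 1.0, 10.0]

# closed-form leave-one-out errors, computed without refitting
ridge_cv = RidgeCV(alphas=alphas,
                   scoring="neg_mean_squared_error").fit(X, y)

# explicit leave-one-out: refits Ridge n_samples times per alpha
grid = GridSearchCV(Ridge(), {"alpha": alphas},
                    scoring="neg_mean_squared_error",
                    cv=LeaveOneOut()).fit(X, y)

assert ridge_cv.alpha_ == grid.best_params_["alpha"]
np.testing.assert_allclose(ridge_cv.coef_, grid.best_estimator_.coef_,
                           rtol=1e-6)
```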

# consistent with the default multioutput behavior of mean_squared_error
alpha_score = -squared_errors.mean()
if self.store_cv_values:
self.cv_values_[:, i] = squared_errors.ravel()
else:
predictions = y - (c / G_inverse_diag)
cv_values[:, i] = predictions.ravel()
C.append(c)

if error:
best = cv_values.mean(axis=0).argmin()
else:
# The scorer wants an object that will make the predictions, but
# they are already computed efficiently by _RidgeGCV. This
# identity_estimator will just return them
def identity_estimator():
pass
identity_estimator.decision_function = lambda y_predict: y_predict
identity_estimator.predict = lambda y_predict: y_predict

# signature of scorer is (estimator, X, y)
out = [scorer(identity_estimator, cv_values[:, i], y.ravel())
for i in range(len(self.alphas))]
best = np.argmax(out)

self.alpha_ = self.alphas[best]
self.dual_coef_ = C[best]
# convert predictions back to the original space
# return converted predictions
# calculate scores based on converted predictions
if y.ndim == 2:
y_true = y / sqrt_sw[:, np.newaxis] + y_offset
y_pred = predictions / sqrt_sw[:, np.newaxis] + y_offset

Review comment (Member): You don't need to create a new axis; you can ravel and use np.repeat(sqrt_sw, n_y).

Review comment (Member Author): Is this memory-efficient? I guess broadcasting is better?
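
For reference, the two variants compute the same values; broadcasting just avoids materializing an (n_samples * n_y,) weight array. A small sketch with assumed shapes:

```python
import numpy as np

rng = np.random.RandomState(0)
n_samples, n_y = 5, 3
predictions = rng.rand(n_samples, n_y)
sqrt_sw = rng.rand(n_samples) + 1.0  # strictly positive weights

# option 1: broadcasting, no intermediate repeated-weight array
out_broadcast = predictions / sqrt_sw[:, np.newaxis]

# option 2: ravel plus np.repeat, as the reviewer suggests
out_repeat = predictions.ravel() / np.repeat(sqrt_sw, n_y)

np.testing.assert_allclose(out_broadcast.ravel(), out_repeat)
```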

else:
y_true = y / sqrt_sw + y_offset
y_pred = predictions / sqrt_sw + y_offset
# let the underlying scorer handle multioutput
alpha_score = scorer(
_IdentityEstimator(), y_pred, y_true)
if self.store_cv_values:
self.cv_values_[:, i] = y_pred.ravel()
if (best_score is None) or (alpha_score > best_score):
best_coef, best_score, best_alpha = c, alpha_score, alpha

self.alpha_ = best_alpha
self.best_score_ = best_score
self.dual_coef_ = best_coef
self.coef_ = safe_sparse_dot(self.dual_coef_.T, X)

X_offset += X_mean * X_scale
@@ -1509,7 +1537,7 @@ def identity_estimator():
cv_values_shape = n_samples, len(self.alphas)
else:
cv_values_shape = n_samples, n_y, len(self.alphas)
self.cv_values_ = cv_values.reshape(cv_values_shape)
self.cv_values_ = self.cv_values_.reshape(cv_values_shape)

return self

@@ -1565,6 +1593,7 @@ def fit(self, X, y, sample_weight=None):
store_cv_values=self.store_cv_values)
estimator.fit(X, y, sample_weight=sample_weight)
self.alpha_ = estimator.alpha_
self.best_score_ = estimator.best_score_
if self.store_cv_values:
self.cv_values_ = estimator.cv_values_
else:
@@ -1580,6 +1609,7 @@
gs.fit(X, y, sample_weight=sample_weight)
estimator = gs.best_estimator_
self.alpha_ = gs.best_estimator_.alpha
self.best_score_ = gs.best_score_

self.coef_ = estimator.coef_
self.intercept_ = estimator.intercept_
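
The net effect of the ENH half of this PR: after fitting, RidgeCV exposes the score of the selected alpha. A minimal usage sketch (hypothetical, assuming this branch, where best_score_ is the attribute added above; with the default scoring=None it holds the negative mean squared error):

```python
from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV

X, y = make_regression(n_samples=20, n_features=5, random_state=0)

reg = RidgeCV(alphas=[0.1, 1.0, 10.0]).fit(X, y)
print(reg.alpha_)       # selected hyperparameter
print(reg.best_score_)  # score of the selected alpha
```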
163 changes: 102 additions & 61 deletions sklearn/linear_model/tests/test_ridge.py
@@ -21,6 +21,7 @@
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.metrics import get_scorer
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ridge_regression
@@ -36,7 +37,9 @@
from sklearn.datasets import make_regression

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, GroupKFold, cross_val_predict
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_predict

from sklearn.utils import check_random_state
from sklearn.datasets import make_multilabel_classification
@@ -493,66 +496,6 @@ def test_ridge_loo_cv_asym_scoring():
assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)


@pytest.mark.parametrize('gcv_mode', ['svd', 'eigen'])
@pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix])
@pytest.mark.parametrize('n_features', [8, 20])
@pytest.mark.parametrize('y_shape, fit_intercept, noise',
[((11,), True, 1.),
((11, 1), True, 20.),
((11, 3), True, 150.),
((11, 3), False, 30.)])
def test_ridge_gcv_sample_weights(
gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
alphas = [1e-3, .1, 1., 10., 1e3]
rng = np.random.RandomState(0)
n_targets = y_shape[-1] if len(y_shape) == 2 else 1
X, y = _make_sparse_offset_regression(
n_samples=11, n_features=n_features, n_targets=n_targets,
random_state=0, shuffle=False, noise=noise)
y = y.reshape(y_shape)

sample_weight = 3 * rng.randn(len(X))
sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
indices = np.repeat(np.arange(X.shape[0]), sample_weight)
sample_weight = sample_weight.astype(float)
X_tiled, y_tiled = X[indices], y[indices]

cv = GroupKFold(n_splits=X.shape[0])
splits = cv.split(X_tiled, y_tiled, groups=indices)
kfold = RidgeCV(
alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
fit_intercept=fit_intercept)
# ignore warning from GridSearchCV: FutureWarning: The default
# of the `iid` parameter will change from True to False in version 0.22
# and will be removed in 0.24
with ignore_warnings(category=FutureWarning):
kfold.fit(X_tiled, y_tiled)

ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
splits = cv.split(X_tiled, y_tiled, groups=indices)
predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
kfold_errors = (y_tiled - predictions)**2
kfold_errors = [
np.sum(kfold_errors[indices == i], axis=0) for
i in np.arange(X.shape[0])]
kfold_errors = np.asarray(kfold_errors)

X_gcv = X_constructor(X)
gcv_ridge = RidgeCV(
alphas=alphas, store_cv_values=True,
gcv_mode=gcv_mode, fit_intercept=fit_intercept)
gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
if len(y_shape) == 2:
gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
else:
gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]

assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)


@pytest.mark.parametrize('mode', [True, 1, 5, 'bad', 'gcv'])
def test_check_gcv_mode_error(mode):
X, y = make_regression(n_samples=5, n_features=2)
@@ -1247,3 +1190,101 @@ def test_ridge_sag_with_X_fortran():
X = X[::2, :]
y = y[::2]
Ridge(solver='sag').fit(X, y)


@pytest.mark.parametrize("fit_intercept", [True, False])
@pytest.mark.parametrize("use_sample_weight", [True, False])
@pytest.mark.parametrize("multioutput", [True, False])
def test_ridgecv_default_scorer_consistency(fit_intercept, use_sample_weight,
multioutput):
if multioutput:
X, y = make_regression(n_samples=10, n_features=5,
n_targets=3, random_state=0)
else:
X, y = make_regression(n_samples=10, n_features=5, random_state=0)
if use_sample_weight:
rng = np.random.RandomState(0)
sample_weight = rng.rand(X.shape[0])
else:
sample_weight = None
alphas = [0.1, 1, 10]

clf1 = RidgeCV(
fit_intercept=fit_intercept, scoring=None,
store_cv_values=True, alphas=alphas
)
clf1.fit(X, y, sample_weight=sample_weight)

# check consistency between RidgeCV(scoring=None) and
# RidgeCV(scoring="neg_mean_squared_error")
clf2 = RidgeCV(
fit_intercept=fit_intercept, scoring="neg_mean_squared_error",
store_cv_values=True, alphas=alphas
)
clf2.fit(X, y, sample_weight=sample_weight)
assert clf1.alpha_ == pytest.approx(clf2.alpha_)
assert clf1.best_score_ == pytest.approx(clf2.best_score_)
assert_array_almost_equal(clf1.coef_, clf2.coef_)
assert_array_almost_equal(clf1.intercept_, clf2.intercept_)
if multioutput:
cv_results_1 = clf1.cv_values_[:, :, alphas.index(clf1.alpha_)]
cv_results_2 = clf2.cv_values_[:, :, alphas.index(clf2.alpha_)]
else:
cv_results_1 = clf1.cv_values_[:, alphas.index(clf1.alpha_)]
cv_results_2 = clf2.cv_values_[:, alphas.index(clf2.alpha_)]
assert_array_almost_equal(cv_results_1, (y - cv_results_2) ** 2)
assert (clf1.best_score_ ==
pytest.approx(-mean_squared_error(y, cv_results_2)))

# check consistency between RidgeCV and GridSearchCV
# this only holds for this specific scorer
clf2 = GridSearchCV(Ridge(fit_intercept=fit_intercept), {"alpha": alphas},
scoring="neg_mean_squared_error", cv=LeaveOneOut())
clf2.fit(X, y, sample_weight=sample_weight)
assert clf1.alpha_ == pytest.approx(clf2.best_params_["alpha"])
assert clf1.best_score_ == pytest.approx(clf2.best_score_)
assert_array_almost_equal(clf1.coef_, clf2.best_estimator_.coef_)
assert_array_almost_equal(clf1.intercept_, clf2.best_estimator_.intercept_)

# check consistency between RidgeCV and cross_val_predict
# this is true for an arbitrary scorer
ridge = Ridge(alpha=clf1.alpha_, fit_intercept=fit_intercept)
loo_pred = cross_val_predict(ridge, X, y, cv=LeaveOneOut(),
fit_params={"sample_weight": sample_weight})
assert_array_almost_equal(loo_pred, cv_results_2)


@pytest.mark.parametrize("fit_intercept", [True, False])
@pytest.mark.parametrize("use_sample_weight", [True, False])
@pytest.mark.parametrize("multioutput", [True, False])
def test_ridgecv_custom_scorer_consistency(fit_intercept, use_sample_weight,
multioutput):
if multioutput:
X, y = make_regression(n_samples=10, n_features=5,
n_targets=3, random_state=0)
else:
X, y = make_regression(n_samples=10, n_features=5, random_state=0)
if use_sample_weight:
rng = np.random.RandomState(0)
sample_weight = rng.rand(X.shape[0])
else:
sample_weight = None

alphas = [0.1, 1, 10]
clf = RidgeCV(
fit_intercept=fit_intercept, scoring="r2",
store_cv_values=True, alphas=alphas
)
clf.fit(X, y, sample_weight=sample_weight)
if multioutput:
cv_results = clf.cv_values_[:, :, alphas.index(clf.alpha_)]
else:
cv_results = clf.cv_values_[:, alphas.index(clf.alpha_)]
assert clf.best_score_ == pytest.approx(r2_score(y, cv_results))

# check consistency between RidgeCV and cross_val_predict
# this is true for an arbitrary scorer
ridge = Ridge(alpha=clf.alpha_, fit_intercept=fit_intercept)
loo_pred = cross_val_predict(ridge, X, y, cv=LeaveOneOut(),
fit_params={"sample_weight": sample_weight})
assert_array_almost_equal(loo_pred, cv_results)