
Since our loss function is dependent on the number of samples, the latter
will influence the selected value of `C`.
-The question that arises is `How do we optimally adjust C to
-account for the different amount of training samples?`
-
-The figures below are used to illustrate the effect of scaling our
-`C` to compensate for the change in the number of samples, in the
-case of using an `l1` penalty, as well as the `l2` penalty.
-
-l1-penalty case
------------------
-In the `l1` case, theory says that prediction consistency
-(i.e. that under given hypothesis, the estimator
-learned predicts as well as a model knowing the true distribution)
-is not possible because of the bias of the `l1`. It does say, however,
-that model consistency, in terms of finding the right set of non-zero
-parameters as well as their signs, can be achieved by scaling
-`C1`.
-
-l2-penalty case
------------------
-The theory says that in order to achieve prediction consistency, the
-penalty parameter should be kept constant
-as the number of samples grow.
-
-Simulations
------------
-
-The two figures below plot the values of `C` on the `x-axis` and the
-corresponding cross-validation scores on the `y-axis`, for several different
-fractions of a generated data-set.
-
-In the `l1` penalty case, the cross-validation-error correlates best with
-the test-error, when scaling our `C` with the number of samples, `n`,
-which can be seen in the first figure.
-
-For the `l2` penalty case, the best result comes from the case where `C`
-is not scaled.
-
-.. topic:: Note:
-
-   Two separate datasets are used for the two different plots. The reason
-   behind this is the `l1` case works better on sparse data, while `l2`
-   is better suited to the non-sparse case.
+The question that arises is "How do we optimally adjust C to
+account for the different number of training samples?"

+In the remainder of this example, we will investigate the effect of scaling
+the value of the regularization parameter `C` with respect to the number of
+samples for both the L1 and L2 penalties. We will generate some synthetic
+datasets that are appropriate for each type of regularization.
"""
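For background on why the number of samples interacts with `C` at all (a sketch, not text from the patch): with the squared hinge loss used throughout this example, the L1-penalized `LinearSVC` minimizes an objective of roughly the form

    \min_{w, b} \; \|w\|_1 + C \sum_{i=1}^{n} \max\left(0, 1 - y_i (w^\top x_i + b)\right)^2

with \tfrac{1}{2}\|w\|_2^2 in place of \|w\|_1 for the L2 penalty. The data-fit term is a sum over the n training samples, so for a fixed `C` it grows roughly linearly with n while the penalty term does not; rescaling `C` by 1/n (equivalently, plotting scores against `C * n`) keeps the two terms on a comparable footing as n changes. Whether that rescaling is actually the right choice depends on the penalty, which is what the experiments below check empirically.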

# Author: Andreas Mueller <[email protected]>
#         Jaques Grobler <[email protected]>
# License: BSD 3 clause

+# %%
+# L1-penalty case
+# ---------------
+# In the L1 case, theory says that prediction consistency (i.e. that under a
+# given hypothesis, the estimator learned predicts as well as a model knowing
+# the true distribution) is not possible because of the bias of the L1
+# penalty. It does say, however, that model consistency, in terms of finding
+# the right set of non-zero parameters as well as their signs, can be
+# achieved by scaling `C`.
+#
+# We will demonstrate this effect by using a synthetic dataset. This
+# dataset will be sparse, meaning that only a few features will be informative
+# and useful for the model.
+from sklearn.datasets import make_classification
+
+n_samples, n_features = 100, 300
+X, y = make_classification(
+    n_samples=n_samples, n_features=n_features, n_informative=5, random_state=1
+)
+
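As an aside (not part of the patch), the sparsity that the L1 penalty induces on this dataset can be checked directly by fitting the model at a single `C` value taken from the grid explored below and counting the surviving coefficients; the value `C=0.05` is illustrative only:

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.svm import LinearSVC

    # Same synthetic problem as in the patch: 300 features, only 5 informative.
    X, y = make_classification(
        n_samples=100, n_features=300, n_informative=5, random_state=1
    )

    # One C value from the upper end of the grid used below (illustrative only).
    clf = LinearSVC(penalty="l1", loss="squared_hinge", dual=False, tol=1e-3, C=0.05)
    clf.fit(X, y)

    # With an L1 penalty most coefficients are driven exactly to zero.
    n_nonzero = np.sum(np.abs(clf.coef_) > 1e-10)
    print(f"{n_nonzero} non-zero coefficients out of {X.shape[1]}")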
+# %%
+# Now, we can define a linear SVC with the `l1` penalty.
+from sklearn.svm import LinearSVC
+
+model_l1 = LinearSVC(penalty="l1", loss="squared_hinge", dual=False, tol=1e-3)
+
+# %%
+# We will compute the mean test score for different values of `C`.
import numpy as np
+import pandas as pd
+from sklearn.model_selection import validation_curve, ShuffleSplit
+
+Cs = np.logspace(-2.3, -1.3, 10)
+train_sizes = np.linspace(0.3, 0.7, 3)
+labels = [f"fraction: {train_size}" for train_size in train_sizes]
+
+results = {"C": Cs}
+for label, train_size in zip(labels, train_sizes):
+    cv = ShuffleSplit(train_size=train_size, test_size=0.3, n_splits=50, random_state=1)
+    train_scores, test_scores = validation_curve(
+        model_l1, X, y, param_name="C", param_range=Cs, cv=cv
+    )
+    results[label] = test_scores.mean(axis=1)
+results = pd.DataFrame(results)
+
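For readers unfamiliar with `validation_curve`: it fits a clone of the estimator for every combination of parameter value and CV split and returns two arrays of shape `(len(param_range), n_splits)`; the patch keeps only the test scores and averages them over the 50 shuffle splits. A minimal, self-contained sketch of the call pattern (smaller grid, fewer splits, and a purely illustrative dataset so it runs quickly):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.model_selection import ShuffleSplit, validation_curve
    from sklearn.svm import LinearSVC

    X, y = make_classification(n_samples=100, n_features=20, random_state=0)
    Cs = np.logspace(-2, 0, 3)
    cv = ShuffleSplit(train_size=0.5, test_size=0.3, n_splits=5, random_state=0)

    train_scores, test_scores = validation_curve(
        LinearSVC(dual=True), X, y, param_name="C", param_range=Cs, cv=cv
    )
    # One row per C value, one column per CV split.
    print(train_scores.shape, test_scores.shape)  # (3, 5) (3, 5)
    print(test_scores.mean(axis=1))  # mean test score for each C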
+# %%
import matplotlib.pyplot as plt

-from sklearn.svm import LinearSVC
-from sklearn.model_selection import ShuffleSplit
-from sklearn.model_selection import GridSearchCV
-from sklearn.utils import check_random_state
-from sklearn import datasets
+fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 6))
+
+# plot results without scaling C
+results.plot(x="C", ax=axes[0], logx=True)
+axes[0].set_ylabel("CV score")
+axes[0].set_title("No scaling")
+
+# plot results by scaling C
+for train_size_idx, label in enumerate(labels):
+    results_scaled = results[[label]].assign(
+        C_scaled=Cs * float(n_samples * train_sizes[train_size_idx])
+    )
+    results_scaled.plot(x="C_scaled", ax=axes[1], logx=True, label=label)
+axes[1].set_title("Scaling C by 1 / n_samples")
+
+_ = fig.suptitle("Effect of scaling C with L1 penalty")
+
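A note on what the right-hand panel plots (the numbers below mirror the patch but are recomputed here purely for illustration): its x-axis is the `C` grid multiplied by the number of training samples used for each fraction, so if the optimal `C` scales like `1 / n_train` the curves for the different fractions should peak at roughly the same position on that axis:

    import numpy as np

    Cs = np.logspace(-2.3, -1.3, 10)  # same grid as in the patch
    n_samples = 100

    for train_size in (0.3, 0.5, 0.7):
        n_train = n_samples * train_size
        C_scaled = Cs * n_train  # x-coordinates of the right-hand panel
        print(f"fraction {train_size}: C * n_train spans "
              f"{C_scaled[0]:.2f} to {C_scaled[-1]:.2f}")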
+# %%
+# Here, we observe that the cross-validation error correlates best with the
+# test error when we scale our `C` with the number of samples, `n`.
+#
+# L2-penalty case
+# ---------------
+# We can repeat a similar experiment with the `l2` penalty. In this case, we
+# don't need to use a sparse dataset.
+#
+# For the `l2` penalty, the theory says that in order to achieve prediction
+# consistency, the penalty parameter should be kept constant as the number of
+# samples grows.
+#
+# So we will repeat the same experiment by creating a linear SVC classifier
+# with the `l2` penalty and check the test score via cross-validation and
+# plot the results with and without scaling the parameter `C`.
+rng = np.random.RandomState(1)
+y = np.sign(0.5 - rng.rand(n_samples))
+X = rng.randn(n_samples, n_features // 5) + y[:, np.newaxis]
+X += 5 * rng.randn(n_samples, n_features // 5)
+
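Unlike the L1 dataset, this one is dense by construction: each of the `n_features // 5 = 60` retained features receives a +/-1 class shift that is then buried under noise with standard deviation 5, so the signal is spread thinly across all features rather than concentrated in a few. A small standalone check (sizes assumed from the patch):

    import numpy as np

    rng = np.random.RandomState(1)
    n_samples, n_kept = 100, 300 // 5
    y = np.sign(0.5 - rng.rand(n_samples))               # labels in {-1, +1}
    X = rng.randn(n_samples, n_kept) + y[:, np.newaxis]  # +/-1 class shift per feature
    X += 5 * rng.randn(n_samples, n_kept)                # noise dominates each feature

    # The expected gap between class means is 2 for every feature, well below
    # the within-class spread of roughly sqrt(1 + 25) ~ 5.1.
    gap = np.abs(X[y == 1].mean(axis=0) - X[y == -1].mean(axis=0))
    print(gap.mean())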
+# %%
+model_l2 = LinearSVC(penalty="l2", loss="squared_hinge", dual=True)
+Cs = np.logspace(-4.5, -2, 10)
+
+labels = [f"fraction: {train_size}" for train_size in train_sizes]
+results = {"C": Cs}
+for label, train_size in zip(labels, train_sizes):
+    cv = ShuffleSplit(train_size=train_size, test_size=0.3, n_splits=50, random_state=1)
+    train_scores, test_scores = validation_curve(
+        model_l2, X, y, param_name="C", param_range=Cs, cv=cv
+    )
+    results[label] = test_scores.mean(axis=1)
+results = pd.DataFrame(results)
+
+# %%
+import matplotlib.pyplot as plt

-rnd = check_random_state(1)
+fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 6))

-# set up dataset
-n_samples = 100
-n_features = 300
+# plot results without scaling C
+results.plot(x="C", ax=axes[0], logx=True)
+axes[0].set_ylabel("CV score")
+axes[0].set_title("No scaling")

-# l1 data (only 5 informative features)
-X_1, y_1 = datasets.make_classification(
-    n_samples=n_samples, n_features=n_features, n_informative=5, random_state=1
-)
+# plot results by scaling C
+for train_size_idx, label in enumerate(labels):
+    results_scaled = results[[label]].assign(
+        C_scaled=Cs * float(n_samples * train_sizes[train_size_idx])
+    )
+    results_scaled.plot(x="C_scaled", ax=axes[1], logx=True, label=label)
+axes[1].set_title("Scaling C by 1 / n_samples")
+
+_ = fig.suptitle("Effect of scaling C with L2 penalty")

-# l2 data: non sparse, but less features
-y_2 = np.sign(0.5 - rnd.rand(n_samples))
-X_2 = rnd.randn(n_samples, n_features // 5) + y_2[:, np.newaxis]
-X_2 += 5 * rnd.randn(n_samples, n_features // 5)
-
-clf_sets = [
-    (
-        LinearSVC(penalty="l1", loss="squared_hinge", dual=False, tol=1e-3),
-        np.logspace(-2.3, -1.3, 10),
-        X_1,
-        y_1,
-    ),
-    (
-        LinearSVC(penalty="l2", loss="squared_hinge", dual=True),
-        np.logspace(-4.5, -2, 10),
-        X_2,
-        y_2,
-    ),
-]
-
-colors = ["navy", "cyan", "darkorange"]
-lw = 2
-
-for clf, cs, X, y in clf_sets:
-    # set up the plot for each regressor
-    fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))
-
-    for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]):
-        param_grid = dict(C=cs)
-        # To get nice curve, we need a large number of iterations to
-        # reduce the variance
-        grid = GridSearchCV(
-            clf,
-            refit=False,
-            param_grid=param_grid,
-            cv=ShuffleSplit(
-                train_size=train_size, test_size=0.3, n_splits=50, random_state=1
-            ),
-        )
-        grid.fit(X, y)
-        scores = grid.cv_results_["mean_test_score"]
-
-        scales = [
-            (1, "No scaling"),
-            ((n_samples * train_size), "1/n_samples"),
-        ]
-
-        for ax, (scaler, name) in zip(axes, scales):
-            ax.set_xlabel("C")
-            ax.set_ylabel("CV Score")
-            grid_cs = cs * float(scaler)  # scale the C's
-            ax.semilogx(
-                grid_cs,
-                scores,
-                label="fraction %.2f" % train_size,
-                color=colors[k],
-                lw=lw,
-            )
-            ax.set_title(
-                "scaling=%s, penalty=%s, loss=%s" % (name, clf.penalty, clf.loss)
-            )
-
-    plt.legend(loc="best")
+# %%
+# So for the L2 penalty case, the best result comes from the case where `C`
+# is not scaled.
plt.show()