[WIP] Sample weight consistency #5515
@@ -11,13 +11,23 @@
import warnings
import sys
import pkgutil
import numpy as np

from sklearn import datasets
from sklearn.base import is_classifier, is_regressor
from sklearn.cross_validation import train_test_split
from sklearn.externals.six import PY3
from sklearn.externals.six.moves import zip
from sklearn.utils import check_random_state
from sklearn.utils.testing import assert_false, clean_warning_registry
from sklearn.utils.testing import all_estimators
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import set_random_state
from sklearn.utils.validation import has_fit_parameter

from numpy.testing import assert_array_almost_equal, assert_equal

import sklearn
from sklearn.cluster.bicluster import BiclusterMixin
@@ -219,3 +229,155 @@ def test_get_params_invariance():
            yield check_get_params_invariance, name, Estimator
        else:
            yield check_get_params_invariance, name, Estimator
        #yield check_transformer_n_iter, name, Estimator()


def test_sample_weight_consistency(random_state=42):
    exclude = [
        'AdaBoostClassifier', 'AdaBoostRegressor',
        'BaggingClassifier', 'BaggingRegressor',
        'GradientBoostingClassifier', 'GradientBoostingRegressor',
        'ExtraTreeClassifier', 'ExtraTreeRegressor',
        'ExtraTreesClassifier', 'ExtraTreesRegressor',
        'DecisionTreeClassifier', 'DecisionTreeRegressor',
        'LogisticRegression', 'LogisticRegressionCV', 'LinearSVC',
        'MultinomialNB',  # requires positive samples
        'CalibratedClassifierCV',  # meta-estimator using LinearSVC
        'SGDClassifier',  # doesn't work; probably needs more data
        'SGDRegressor',  # doesn't work; probably needs more data
        'Perceptron',  # uses SGD too; doesn't work, probably needs more data
        'RidgeClassifierCV', 'RidgeCV',
        'RandomForestClassifier', 'RandomForestRegressor',
    ]
    estimators = all_estimators()

    n_samples, n_features = 20, 5
    rng = check_random_state(random_state)

    sample_weight = rng.randint(1, 4, (n_samples,))

    X_clf, y_clf = datasets.make_classification(
        n_samples=n_samples, n_features=n_features,
        random_state=random_state)
    X_reg, y_reg = datasets.make_regression(
        n_samples=n_samples, n_features=n_features,
        n_informative=2, random_state=random_state)

    def aug(data, sample_weight):
        # repeat each sample as many times as its (integer) sample weight
        aug_data = []
        for samples, weight in zip(zip(*data), sample_weight):
            for _ in range(weight):
                aug_data.append(samples)
        aug_data = list(map(np.array, zip(*aug_data)))
        return aug_data

    train, test = train_test_split(range(n_samples))

    for name, Estimator in estimators:
        print("%s is being analysed" % name)
        if name in exclude:
            print("%s is being excluded" % name)
            continue
        if not has_fit_parameter(Estimator, 'sample_weight'):
            continue
        if is_classifier(Estimator):
            X, y = X_clf, y_clf
        elif is_regressor(Estimator):
            X, y = X_reg, y_reg
        else:
            print("%s is neither classifier nor regressor" % name)
            continue

        estimator_sw = Estimator()
        set_random_state(estimator_sw, random_state=random_state)
        estimator_sw.fit(X[train], y[train],
                         sample_weight=sample_weight[train])
        X_aug_train, y_aug_train = aug((X[train], y[train]),
                                       sample_weight[train])
        assert_equal(X_aug_train.shape[0], np.sum(sample_weight[train]))

        estimator_aug = Estimator()
        set_random_state(estimator_aug, random_state=random_state)
        estimator_aug.fit(X_aug_train, y_aug_train)

        precision = 6
        # if the estimator exposes `coef_`, compare the fitted coefficients
        if hasattr(estimator_sw, 'coef_'):
            yield (assert_array_almost_equal,
                   estimator_sw.coef_, estimator_aug.coef_, precision,
                   name + ' coef_ not equal')

        pred_sw = estimator_sw.predict(X[test])
        pred_aug = estimator_aug.predict(X[test])

        yield (assert_array_almost_equal, pred_sw, pred_aug, precision,
               name + ' prediction not equal')


def test_sample_weight_0(random_state=42):
    exclude = [
        'AdaBoostClassifier', 'AdaBoostRegressor',
        'BaggingClassifier', 'BaggingRegressor',
        'GradientBoostingClassifier', 'GradientBoostingRegressor',
        'ExtraTreeClassifier', 'ExtraTreeRegressor',
        'ExtraTreesClassifier', 'ExtraTreesRegressor',
        'DecisionTreeClassifier', 'DecisionTreeRegressor',
        'LogisticRegression', 'LogisticRegressionCV', 'LinearSVC',
        'MultinomialNB',  # requires positive samples
        'CalibratedClassifierCV',  # meta-estimator using LinearSVC
        'SGDClassifier',  # doesn't work; probably needs more data
        'SGDRegressor',  # doesn't work; probably needs more data
        'Perceptron',  # uses SGD too; doesn't work, probably needs more data
        'RidgeClassifierCV', 'RidgeCV',
        'RandomForestClassifier', 'RandomForestRegressor',
    ]
    estimators = all_estimators()

    n_samples, n_features = 20, 5
    rng = check_random_state(random_state)

    sample_weight = (rng.permutation(n_samples) < (n_samples / 2.)).astype(float)
Review comment: I was not suggesting to do that. I think […]

Review comment: Well, you could have a dataset with some duplicate samples and use sample-weight 0.5 on it and compare against the data without duplicates? Or use 1.5 and compare it with data that has triples?
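A minimal sketch of that duplicate-samples idea (not part of this PR; Ridge is used only as an example of an estimator whose fit accepts sample_weight, and X_reg, y_reg are the regression data built above):

from sklearn.linear_model import Ridge

# every sample appears twice, each copy weighted 0.5, so the total weight
# per original sample is 1 and the fit should match a plain unweighted fit
X_dup = np.vstack([X_reg, X_reg])
y_dup = np.concatenate([y_reg, y_reg])
half_weight = 0.5 * np.ones(X_dup.shape[0])

est_plain = Ridge().fit(X_reg, y_reg)
est_half = Ridge().fit(X_dup, y_dup, sample_weight=half_weight)
assert_array_almost_equal(est_plain.coef_, est_half.coef_, 6)

The 1.5 variant would follow the same total-weight reasoning: weight 1.5 on the duplicated data carries the same total weight as unweighted data with each sample tripled.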

    X_clf, y_clf = datasets.make_classification(
        n_samples=n_samples, n_features=n_features,
        random_state=random_state)
    X_reg, y_reg = datasets.make_regression(
        n_samples=n_samples, n_features=n_features,
        n_informative=2, random_state=random_state)

    train, test = train_test_split(range(n_samples))

    for name, Estimator in estimators:
        print("%s is being analysed" % name)
        if name in exclude:
            print("%s is being excluded" % name)
            continue
        if not has_fit_parameter(Estimator, 'sample_weight'):
            continue
        if is_classifier(Estimator):
            X, y = X_clf, y_clf
        elif is_regressor(Estimator):
            X, y = X_reg, y_reg
        else:
            print("%s is neither classifier nor regressor" % name)
            continue

        estimator_sw = Estimator()
        set_random_state(estimator_sw, random_state=random_state)
        estimator_sw.fit(X[train], y[train],
                         sample_weight=sample_weight[train])
        # keep only the samples whose weight is 1
        X_aug_train, y_aug_train = (X[train][sample_weight[train] == 1],
                                    y[train][sample_weight[train] == 1])
        estimator_aug = Estimator()
        set_random_state(estimator_aug, random_state=random_state)
        estimator_aug.fit(X_aug_train, y_aug_train)

        precision = 6
        # if the estimator exposes `coef_`, compare the fitted coefficients
        if hasattr(estimator_sw, 'coef_'):
            yield (assert_array_almost_equal,
                   estimator_sw.coef_, estimator_aug.coef_, precision,
                   name + ' coef_ not equal')

        pred_sw = estimator_sw.predict(X[test])
        pred_aug = estimator_aug.predict(X[test])

        yield (assert_array_almost_equal, pred_sw, pred_aug, precision,
               name + ' prediction not equal')
Review comment: Do you think a test for aug would make sense here? Something like […]
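One possible shape for such a test (a sketch only, since the suggested snippet is truncated above; it also assumes aug is lifted out of test_sample_weight_consistency to module level so it can be called directly):

def test_aug():
    # aug should repeat each sample as many times as its integer weight
    X = np.array([[1, 2], [3, 4], [5, 6]])
    y = np.array([0, 1, 0])
    sample_weight = np.array([2, 1, 3])
    X_aug, y_aug = aug((X, y), sample_weight)
    assert_equal(X_aug.shape[0], sample_weight.sum())
    assert_array_almost_equal(
        X_aug, [[1, 2], [1, 2], [3, 4], [5, 6], [5, 6], [5, 6]])
    assert_array_almost_equal(y_aug, [0, 0, 1, 0, 0, 0])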