[WIP] Sample weight consistency #5515

Status: Closed · wants to merge 17 commits
162 changes: 162 additions & 0 deletions sklearn/tests/test_common.py
@@ -11,13 +11,23 @@
import warnings
import sys
import pkgutil
import numpy as np

from sklearn import datasets
from sklearn.base import is_classifier, is_regressor
from sklearn.cross_validation import train_test_split
from sklearn.externals.six import PY3
from sklearn.externals.six.moves import zip
from sklearn.utils import check_random_state
from sklearn.utils.testing import assert_false, clean_warning_registry
from sklearn.utils.testing import all_estimators
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import set_random_state
from sklearn.utils.validation import has_fit_parameter

from numpy.testing import assert_array_almost_equal, assert_equal

import sklearn
from sklearn.cluster.bicluster import BiclusterMixin
@@ -219,3 +229,155 @@ def test_get_params_invariance():
                    yield check_get_params_invariance, name, Estimator
            else:
                yield check_get_params_invariance, name, Estimator
                # yield check_transformer_n_iter, name, Estimator()


def test_sample_weight_consistency(random_state=42):
    exclude = [
        'AdaBoostClassifier', 'AdaBoostRegressor',
        'BaggingClassifier', 'BaggingRegressor',
        'GradientBoostingClassifier', 'GradientBoostingRegressor',
        'ExtraTreeClassifier', 'ExtraTreeRegressor',
        'ExtraTreesClassifier', 'ExtraTreesRegressor',
        'DecisionTreeClassifier', 'DecisionTreeRegressor',
        'LogisticRegression', 'LogisticRegressionCV', 'LinearSVC',
        'MultinomialNB',  # Requires positive samples
        'CalibratedClassifierCV',  # Meta-estimator using LinearSVC
        'SGDClassifier',  # Doesn't work. Probably more data needed
        'SGDRegressor',  # Doesn't work. Probably more data needed
        'Perceptron',  # Uses SGD too. Doesn't work. Probably more data needed
        'RidgeClassifierCV', 'RidgeCV',
        'RandomForestClassifier', 'RandomForestRegressor',
    ]
    estimators = all_estimators()

    n_samples, n_features = 20, 5
    rng = check_random_state(random_state)

    sample_weight = rng.randint(1, 4, (n_samples,))
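    # (Illustrative note, not in the original diff.) The weights are drawn
    # from {1, 2, 3}, so every sample is kept at least once and each integer
    # weight has a direct "number of copies" interpretation.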

    X_clf, y_clf = datasets.make_classification(
        n_samples=n_samples, n_features=n_features,
        random_state=random_state)
    X_reg, y_reg = datasets.make_regression(
        n_samples=n_samples, n_features=n_features,
        n_informative=2, random_state=random_state)
    def aug(data, sample_weight):
        # replicate each sample as many times as its (integer) sample_weight
        aug_data = []
        for samples, weight in zip(zip(*data), sample_weight):
            for _ in range(weight):
                aug_data.append(samples)
        aug_data = map(np.array, zip(*aug_data))
        return aug_data
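    # For example (illustrative comment, not in the original diff):
    #   aug(([[0], [1]], [0, 1]), [2, 1])
    # returns ([[0], [0], [1]], [0, 0, 1]) as a pair of arrays: the first
    # sample is repeated twice, the second once. For integer weights,
    # np.repeat(X, weights, axis=0) would behave the same way.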

    train, test = train_test_split(range(n_samples))

    for name, Estimator in estimators:
        print("%s is being analysed" % name)
        if name in exclude:
            print("%s is being excluded" % name)
            continue
        if not has_fit_parameter(Estimator, 'sample_weight'):
            continue
        if is_classifier(Estimator):
            X, y = X_clf, y_clf
        elif is_regressor(Estimator):
            X, y = X_reg, y_reg
        else:
            print("%s is neither classifier nor regressor" % name)
            continue

        estimator_sw = Estimator()
        set_random_state(estimator_sw, random_state=random_state)
        estimator_sw.fit(X[train], y[train],
                         sample_weight=sample_weight[train])
        X_aug_train, y_aug_train = aug((X[train], y[train]),
                                       sample_weight[train])
Review comment (Contributor): Do you think a test for aug would make sense
here? Something like

    assert_equal(X_aug_train.shape[0], np.sum(sample_weight[train]))
        assert_equal(X_aug_train.shape[0], np.sum(sample_weight[train]))

        estimator_aug = Estimator()
        set_random_state(estimator_aug, random_state=random_state)
        estimator_aug.fit(X_aug_train, y_aug_train)

        precision = 6
        # if the estimator has a `coef_` attribute, compare the two fits
        if hasattr(estimator_sw, 'coef_'):
            yield (assert_array_almost_equal,
                   estimator_sw.coef_, estimator_aug.coef_, precision,
                   name + ' coef_ not equal')

        pred_sw = estimator_sw.predict(X[test])
        pred_aug = estimator_aug.predict(X[test])

        yield (assert_array_almost_equal, pred_sw, pred_aug, precision,
               name + ' prediction not equal')
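In isolation, the equivalence this test exercises looks like the following
sketch (illustrative only, not part of the PR; Ridge is an arbitrary choice
of estimator that accepts sample_weight): fitting with an integer weight of
k on a sample should match fitting on k literal copies of that sample.

import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn import datasets
from sklearn.linear_model import Ridge

X, y = datasets.make_regression(n_samples=20, n_features=5,
                                n_informative=2, random_state=42)
w = np.random.RandomState(42).randint(1, 4, 20)

est_w = Ridge().fit(X, y, sample_weight=w)
# np.repeat replicates row i of X exactly w[i] times
est_rep = Ridge().fit(np.repeat(X, w, axis=0), np.repeat(y, w))

assert_array_almost_equal(est_w.coef_, est_rep.coef_, 6)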


def test_sample_weight_0(random_state=42):
    exclude = [
        'AdaBoostClassifier', 'AdaBoostRegressor',
        'BaggingClassifier', 'BaggingRegressor',
        'GradientBoostingClassifier', 'GradientBoostingRegressor',
        'ExtraTreeClassifier', 'ExtraTreeRegressor',
        'ExtraTreesClassifier', 'ExtraTreesRegressor',
        'DecisionTreeClassifier', 'DecisionTreeRegressor',
        'LogisticRegression', 'LogisticRegressionCV', 'LinearSVC',
        'MultinomialNB',  # Requires positive samples
        'CalibratedClassifierCV',  # Meta-estimator using LinearSVC
        'SGDClassifier',  # Doesn't work. Probably more data needed
        'SGDRegressor',  # Doesn't work. Probably more data needed
        'Perceptron',  # Uses SGD too. Doesn't work. Probably more data needed
        'RidgeClassifierCV', 'RidgeCV',
        'RandomForestClassifier', 'RandomForestRegressor',
    ]
    estimators = all_estimators()

    n_samples, n_features = 20, 5
    rng = check_random_state(random_state)

    sample_weight = (rng.permutation(n_samples)
                     < (n_samples / 2.)).astype(np.float)
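    # (Illustrative note, not in the original diff.) Exactly half of the
    # samples get weight 1., the other half weight 0.; fitting with these
    # weights should be equivalent to fitting on the weight-1 subset alone.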
Review comment (Contributor): I was not suggesting to do that. I think int
makes more sense in this case. My argument is that your tests will not cover
the situation in which learning algorithms are weighted with float sample
weights, as there is no corresponding interpretation of "weight = number of
sample copies". I simply do not think there is a way to cover this
situation, though.

Review comment (Member): Well, you could have a dataset with some duplicate
samples and use sample weight 0.5 on it and compare against the data without
duplicates. Or use 1.5 and compare it with data that has triples.
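A rough sketch of that suggestion (illustrative only, not part of the PR;
Ridge is again an arbitrary estimator that accepts sample_weight): with
every sample duplicated, a constant weight of 0.5 per copy gives each
distinct sample a total weight of 2 * 0.5 = 1, so the fit should match an
unweighted fit on the deduplicated data.

import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn import datasets
from sklearn.linear_model import Ridge

X, y = datasets.make_regression(n_samples=20, n_features=5,
                                n_informative=2, random_state=42)
# each sample appears twice in the duplicated dataset
X_dup, y_dup = np.repeat(X, 2, axis=0), np.repeat(y, 2)

est_plain = Ridge().fit(X, y)
est_half = Ridge().fit(X_dup, y_dup,
                       sample_weight=0.5 * np.ones(len(y_dup)))

assert_array_almost_equal(est_plain.coef_, est_half.coef_, 6)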


    X_clf, y_clf = datasets.make_classification(
        n_samples=n_samples, n_features=n_features,
        random_state=random_state)
    X_reg, y_reg = datasets.make_regression(
        n_samples=n_samples, n_features=n_features,
        n_informative=2, random_state=random_state)

    train, test = train_test_split(range(n_samples))

    for name, Estimator in estimators:
        print("%s is being analysed" % name)
        if name in exclude:
            print("%s is being excluded" % name)
            continue
        if not has_fit_parameter(Estimator, 'sample_weight'):
            continue
        if is_classifier(Estimator):
            X, y = X_clf, y_clf
        elif is_regressor(Estimator):
            X, y = X_reg, y_reg
        else:
            print("%s is neither classifier nor regressor" % name)
            continue

        estimator_sw = Estimator()
        set_random_state(estimator_sw, random_state=random_state)
        estimator_sw.fit(X[train], y[train],
                         sample_weight=sample_weight[train])
        X_aug_train = X[train][sample_weight[train] == 1]
        y_aug_train = y[train][sample_weight[train] == 1]
        estimator_aug = Estimator()
        set_random_state(estimator_aug, random_state=random_state)
        estimator_aug.fit(X_aug_train, y_aug_train)

        precision = 6
        # if the estimator has a `coef_` attribute, compare the two fits
        if hasattr(estimator_sw, 'coef_'):
            yield (assert_array_almost_equal,
                   estimator_sw.coef_, estimator_aug.coef_, precision,
                   name + ' coef_ not equal')

        pred_sw = estimator_sw.predict(X[test])
        pred_aug = estimator_aug.predict(X[test])

        yield (assert_array_almost_equal, pred_sw, pred_aug, precision,
               name + ' prediction not equal')