diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 5a28b31b33c2f..16fccb283eff1 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -11,13 +11,23 @@
 import warnings
 import sys
 import pkgutil
 
+import numpy as np
+from sklearn import datasets
+from sklearn.base import is_classifier, is_regressor
+from sklearn.cross_validation import train_test_split
 from sklearn.externals.six import PY3
+from sklearn.externals.six.moves import zip
+from sklearn.utils import check_random_state
 from sklearn.utils.testing import assert_false, clean_warning_registry
 from sklearn.utils.testing import all_estimators
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_in
 from sklearn.utils.testing import ignore_warnings
+from sklearn.utils.testing import set_random_state
+from sklearn.utils.validation import has_fit_parameter
+
+from numpy.testing import assert_array_almost_equal, assert_equal
 
 import sklearn
 from sklearn.cluster.bicluster import BiclusterMixin
@@ -219,3 +229,155 @@ def test_get_params_invariance():
                     yield check_get_params_invariance, name, Estimator
             else:
                 yield check_get_params_invariance, name, Estimator
+                # yield check_transformer_n_iter, name, Estimator()
+
+
+def test_sample_weight_consistency(random_state=42):
+    # Check that fitting with an integer sample_weight is equivalent to
+    # fitting on a dataset in which each sample is repeated weight times.
+    exclude = [
+        'AdaBoostClassifier', 'AdaBoostRegressor',
+        'BaggingClassifier', 'BaggingRegressor',
+        'GradientBoostingClassifier', 'GradientBoostingRegressor',
+        'ExtraTreeClassifier', 'ExtraTreeRegressor',
+        'ExtraTreesClassifier', 'ExtraTreesRegressor',
+        'DecisionTreeClassifier', 'DecisionTreeRegressor',
+        'LogisticRegression', 'LogisticRegressionCV', 'LinearSVC',
+        'MultinomialNB',  # Requires positive samples
+        'CalibratedClassifierCV',  # This is a meta-estimator using LinearSVC
+        'SGDClassifier',  # Doesn't work. Probably more data needed
+        'SGDRegressor',  # Doesn't work. Probably more data needed
+        'Perceptron',  # Uses SGD too. Doesn't work. Probably more data needed
+        'RidgeClassifierCV', 'RidgeCV',
+        'RandomForestClassifier', 'RandomForestRegressor',
+    ]
+    estimators = all_estimators()
+
+    n_samples, n_features = 20, 5
+    rng = check_random_state(random_state)
+
+    sample_weight = rng.randint(1, 4, (n_samples,))
+
+    X_clf, y_clf = datasets.make_classification(
+        n_samples=n_samples, n_features=n_features,
+        random_state=random_state)
+    X_reg, y_reg = datasets.make_regression(
+        n_samples=n_samples, n_features=n_features,
+        n_informative=2, random_state=random_state)
+
+    def aug(data, sample_weight):
+        # Repeat each sample as many times as its (integer) sample weight.
+        aug_data = []
+        for samples, weight in zip(zip(*data), sample_weight):
+            for _ in range(weight):
+                aug_data.append(samples)
+        aug_data = map(np.array, zip(*aug_data))
+        return aug_data
+
+    train, test = train_test_split(range(n_samples))
+
+    for name, Estimator in estimators:
+        print("%s is being analysed" % name)
+        if name in exclude:
+            print("%s is being excluded" % name)
+            continue
+        if not has_fit_parameter(Estimator, 'sample_weight'):
+            continue
+        if is_classifier(Estimator):
+            X, y = X_clf, y_clf
+        elif is_regressor(Estimator):
+            X, y = X_reg, y_reg
+        else:
+            print("%s is neither classifier nor regressor" % name)
+            continue
+
+        estimator_sw = Estimator()
+        set_random_state(estimator_sw, random_state=random_state)
+        estimator_sw.fit(X[train], y[train],
+                         sample_weight=sample_weight[train])
+        X_aug_train, y_aug_train = aug((X[train], y[train]),
+                                       sample_weight[train])
+        assert_equal(X_aug_train.shape[0], np.sum(sample_weight[train]))
+
+        estimator_aug = Estimator()
+        set_random_state(estimator_aug, random_state=random_state)
+        estimator_aug.fit(X_aug_train, y_aug_train)
+
+        precision = 6
+        # If the estimator exposes `coef_`, compare the fitted coefficients.
+        if hasattr(estimator_sw, 'coef_'):
+            yield (assert_array_almost_equal,
+                   estimator_sw.coef_, estimator_aug.coef_, precision,
+                   name + ' coef_ not equal')
+
+        pred_sw = estimator_sw.predict(X[test])
+        pred_aug = estimator_aug.predict(X[test])
+
+        yield (assert_array_almost_equal, pred_sw, pred_aug, precision,
+               name + ' prediction not equal')
+
+
+def test_sample_weight_0(random_state=42):
+    # Check that a sample_weight of 0 is equivalent to dropping the sample.
+    exclude = [
+        'AdaBoostClassifier', 'AdaBoostRegressor',
+        'BaggingClassifier', 'BaggingRegressor',
+        'GradientBoostingClassifier', 'GradientBoostingRegressor',
+        'ExtraTreeClassifier', 'ExtraTreeRegressor',
+        'ExtraTreesClassifier', 'ExtraTreesRegressor',
+        'DecisionTreeClassifier', 'DecisionTreeRegressor',
+        'LogisticRegression', 'LogisticRegressionCV', 'LinearSVC',
+        'MultinomialNB',  # Requires positive samples
+        'CalibratedClassifierCV',  # This is a meta-estimator using LinearSVC
+        'SGDClassifier',  # Doesn't work. Probably more data needed
+        'SGDRegressor',  # Doesn't work. Probably more data needed
+        'Perceptron',  # Uses SGD too. Doesn't work. Probably more data needed
+        'RidgeClassifierCV', 'RidgeCV',
+        'RandomForestClassifier', 'RandomForestRegressor',
+    ]
+    estimators = all_estimators()
+
+    n_samples, n_features = 20, 5
+    rng = check_random_state(random_state)
+
+    sample_weight = (rng.permutation(n_samples) <
+                     (n_samples / 2.)).astype(np.float)
+
+    X_clf, y_clf = datasets.make_classification(
+        n_samples=n_samples, n_features=n_features,
+        random_state=random_state)
+    X_reg, y_reg = datasets.make_regression(
+        n_samples=n_samples, n_features=n_features,
+        n_informative=2, random_state=random_state)
+
+    train, test = train_test_split(range(n_samples))
+
+    for name, Estimator in estimators:
+        print("%s is being analysed" % name)
+        if name in exclude:
+            print("%s is being excluded" % name)
+            continue
+        if not has_fit_parameter(Estimator, 'sample_weight'):
+            continue
+        if is_classifier(Estimator):
+            X, y = X_clf, y_clf
+        elif is_regressor(Estimator):
+            X, y = X_reg, y_reg
+        else:
+            print("%s is neither classifier nor regressor" % name)
+            continue
+
+        estimator_sw = Estimator()
+        set_random_state(estimator_sw, random_state=random_state)
+        estimator_sw.fit(X[train], y[train],
+                         sample_weight=sample_weight[train])
+        X_aug_train = X[train][sample_weight[train] == 1]
+        y_aug_train = y[train][sample_weight[train] == 1]
+
+        estimator_aug = Estimator()
+        set_random_state(estimator_aug, random_state=random_state)
+        estimator_aug.fit(X_aug_train, y_aug_train)
+
+        precision = 6
+        # If the estimator exposes `coef_`, compare the fitted coefficients.
+        if hasattr(estimator_sw, 'coef_'):
+            yield (assert_array_almost_equal,
+                   estimator_sw.coef_, estimator_aug.coef_, precision,
+                   name + ' coef_ not equal')
+
+        pred_sw = estimator_sw.predict(X[test])
+        pred_aug = estimator_aug.predict(X[test])
+
+        yield (assert_array_almost_equal, pred_sw, pred_aug, precision,
+               name + ' prediction not equal')
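
The property these two generator tests exercise can be reproduced in isolation: for estimators whose loss is a per-sample sum plus a penalty that does not depend on the number of samples, fitting with an integer sample_weight should match fitting on data in which each sample is repeated weight times, and a weight of 0 should match dropping the sample. Below is a minimal standalone sketch of that check; the choice of Ridge, the use of np.repeat in place of the patch's aug helper, and the tolerance are illustrative assumptions, not part of the patch.

import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

# Small regression problem with integer sample weights in {1, 2, 3}.
X, y = make_regression(n_samples=20, n_features=5, n_informative=2,
                       random_state=42)
rng = np.random.RandomState(42)
sample_weight = rng.randint(1, 4, X.shape[0])

# Fit once with sample_weight ...
est_sw = Ridge().fit(X, y, sample_weight=sample_weight)

# ... and once on the augmented data where sample i appears sample_weight[i] times.
est_aug = Ridge().fit(np.repeat(X, sample_weight, axis=0),
                      np.repeat(y, sample_weight))
assert_array_almost_equal(est_sw.coef_, est_aug.coef_, decimal=6)

# Zero weights should behave like removing the corresponding samples.
mask = rng.permutation(len(y)) < len(y) // 2
est_zero = Ridge().fit(X, y, sample_weight=mask.astype(float))
est_drop = Ridge().fit(X[mask], y[mask])
assert_array_almost_equal(est_zero.coef_, est_drop.coef_, decimal=6)

Ridge is not in the exclude list above, so it is one of the estimators the generator tests actually hit; the exact agreement relies on the L2 penalty term being independent of the number of training samples.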