[WIP] Sample weight consistency #5515

Status: Closed · wants to merge 17 commits
162 changes: 162 additions & 0 deletions sklearn/tests/test_common.py
@@ -11,13 +11,23 @@
import warnings
import sys
import pkgutil
import numpy as np

from sklearn import datasets
from sklearn.base import is_classifier, is_regressor
from sklearn.cross_validation import train_test_split
from sklearn.externals.six import PY3
from sklearn.externals.six.moves import zip
from sklearn.utils import check_random_state
from sklearn.utils.testing import assert_false, clean_warning_registry
from sklearn.utils.testing import all_estimators
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import set_random_state
from sklearn.utils.validation import has_fit_parameter

from numpy.testing import assert_array_almost_equal, assert_equal

import sklearn
from sklearn.cluster.bicluster import BiclusterMixin
@@ -219,3 +229,155 @@ def test_get_params_invariance():
                    yield check_get_params_invariance, name, Estimator
            else:
                yield check_get_params_invariance, name, Estimator
                # yield check_transformer_n_iter, name, Estimator()


def test_sample_weight_consistency(random_state=42):
    exclude = [
        'AdaBoostClassifier', 'AdaBoostRegressor',
        'BaggingClassifier', 'BaggingRegressor',
        'GradientBoostingClassifier', 'GradientBoostingRegressor',
        'ExtraTreeClassifier', 'ExtraTreeRegressor',
        'ExtraTreesClassifier', 'ExtraTreesRegressor',
        'DecisionTreeClassifier', 'DecisionTreeRegressor',
        'LogisticRegression', 'LogisticRegressionCV', 'LinearSVC',
        'MultinomialNB',  # Requires positive samples
        'CalibratedClassifierCV',  # Meta-estimator using LinearSVC
        'SGDClassifier',  # Doesn't work. Probably more data needed
        'SGDRegressor',  # Doesn't work. Probably more data needed
        'Perceptron',  # Uses SGD too. Doesn't work. Probably more data needed
        'RidgeClassifierCV', 'RidgeCV',
        'RandomForestClassifier', 'RandomForestRegressor',
    ]
    estimators = all_estimators()

    n_samples, n_features = 20, 5
    rng = check_random_state(random_state)

    sample_weight = rng.randint(1, 4, (n_samples,))
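    # (Illustrative note, not in the original diff.) The weights are drawn
    # from {1, 2, 3}, so every sample is kept at least once and each integer
    # weight has a direct "number of copies" interpretation.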

    X_clf, y_clf = datasets.make_classification(
        n_samples=n_samples, n_features=n_features,
        random_state=random_state)
    X_reg, y_reg = datasets.make_regression(
        n_samples=n_samples, n_features=n_features,
        n_informative=2, random_state=random_state)
    def aug(data, sample_weight):
        # replicate each sample as many times as its (integer) sample_weight
        aug_data = []
        for samples, weight in zip(zip(*data), sample_weight):
            for _ in range(weight):
                aug_data.append(samples)
        aug_data = map(np.array, zip(*aug_data))
        return aug_data
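    # For example (illustrative comment, not in the original diff):
    #   aug(([[0], [1]], [0, 1]), [2, 1])
    # returns ([[0], [0], [1]], [0, 0, 1]) as a pair of arrays: the first
    # sample is repeated twice, the second once. For integer weights,
    # np.repeat(X, weights, axis=0) would behave the same way.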

    train, test = train_test_split(range(n_samples))

    for name, Estimator in estimators:
        print("%s is being analysed" % name)
        if name in exclude:
            print("%s is being excluded" % name)
            continue
        if not has_fit_parameter(Estimator, 'sample_weight'):
            continue
        if is_classifier(Estimator):
            X, y = X_clf, y_clf
        elif is_regressor(Estimator):
            X, y = X_reg, y_reg
        else:
            print("%s is neither classifier nor regressor" % name)
            continue

        estimator_sw = Estimator()
        set_random_state(estimator_sw, random_state=random_state)
        estimator_sw.fit(X[train], y[train],
                         sample_weight=sample_weight[train])
        X_aug_train, y_aug_train = aug((X[train], y[train]),
                                       sample_weight[train])
Review comment (Contributor): Do you think a test for aug would make sense
here? Something like

    assert_equal(X_aug_train.shape[0], np.sum(sample_weight[train]))
        assert_equal(X_aug_train.shape[0], np.sum(sample_weight[train]))

        estimator_aug = Estimator()
        set_random_state(estimator_aug, random_state=random_state)
        estimator_aug.fit(X_aug_train, y_aug_train)

        precision = 6
        # if the estimator has a `coef_` attribute, compare the two fits
        if hasattr(estimator_sw, 'coef_'):
            yield (assert_array_almost_equal,
                   estimator_sw.coef_, estimator_aug.coef_, precision,
                   name + ' coef_ not equal')

        pred_sw = estimator_sw.predict(X[test])
        pred_aug = estimator_aug.predict(X[test])

        yield (assert_array_almost_equal, pred_sw, pred_aug, precision,
               name + ' prediction not equal')
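In isolation, the equivalence this test exercises looks like the following
sketch (illustrative only, not part of the PR; Ridge is an arbitrary choice
of estimator that accepts sample_weight): fitting with an integer weight of
k on a sample should match fitting on k literal copies of that sample.

import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn import datasets
from sklearn.linear_model import Ridge

X, y = datasets.make_regression(n_samples=20, n_features=5,
                                n_informative=2, random_state=42)
w = np.random.RandomState(42).randint(1, 4, 20)

est_w = Ridge().fit(X, y, sample_weight=w)
# np.repeat replicates row i of X exactly w[i] times
est_rep = Ridge().fit(np.repeat(X, w, axis=0), np.repeat(y, w))

assert_array_almost_equal(est_w.coef_, est_rep.coef_, 6)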


def test_sample_weight_0(random_state=42):
    exclude = [
        'AdaBoostClassifier', 'AdaBoostRegressor',
        'BaggingClassifier', 'BaggingRegressor',
        'GradientBoostingClassifier', 'GradientBoostingRegressor',
        'ExtraTreeClassifier', 'ExtraTreeRegressor',
        'ExtraTreesClassifier', 'ExtraTreesRegressor',
        'DecisionTreeClassifier', 'DecisionTreeRegressor',
        'LogisticRegression', 'LogisticRegressionCV', 'LinearSVC',
        'MultinomialNB',  # Requires positive samples
        'CalibratedClassifierCV',  # Meta-estimator using LinearSVC
        'SGDClassifier',  # Doesn't work. Probably more data needed
        'SGDRegressor',  # Doesn't work. Probably more data needed
        'Perceptron',  # Uses SGD too. Doesn't work. Probably more data needed
        'RidgeClassifierCV', 'RidgeCV',
        'RandomForestClassifier', 'RandomForestRegressor',
    ]
    estimators = all_estimators()

    n_samples, n_features = 20, 5
    rng = check_random_state(random_state)

    sample_weight = (rng.permutation(n_samples)
                     < (n_samples / 2.)).astype(np.float)
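    # (Illustrative note, not in the original diff.) Exactly half of the
    # samples get weight 1., the other half weight 0.; fitting with these
    # weights should be equivalent to fitting on the weight-1 subset alone.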
Review comment (Contributor): I was not suggesting to do that. I think int
makes more sense in this case. My argument is that your tests will not cover
the situation in which learning algorithms are weighted with float sample
weights, as there is no corresponding interpretation of "weight = number of
sample copies". I simply do not think there is a way to cover this
situation, though.

Review comment (Member): Well, you could have a dataset with some duplicate
samples and use sample weight 0.5 on it and compare against the data without
duplicates. Or use 1.5 and compare it with data that has triples.
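A rough sketch of that suggestion (illustrative only, not part of the PR;
Ridge is again an arbitrary estimator that accepts sample_weight): with
every sample duplicated, a constant weight of 0.5 per copy gives each
distinct sample a total weight of 2 * 0.5 = 1, so the fit should match an
unweighted fit on the deduplicated data.

import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn import datasets
from sklearn.linear_model import Ridge

X, y = datasets.make_regression(n_samples=20, n_features=5,
                                n_informative=2, random_state=42)
# each sample appears twice in the duplicated dataset
X_dup, y_dup = np.repeat(X, 2, axis=0), np.repeat(y, 2)

est_plain = Ridge().fit(X, y)
est_half = Ridge().fit(X_dup, y_dup,
                       sample_weight=0.5 * np.ones(len(y_dup)))

assert_array_almost_equal(est_plain.coef_, est_half.coef_, 6)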


    X_clf, y_clf = datasets.make_classification(
        n_samples=n_samples, n_features=n_features,
        random_state=random_state)
    X_reg, y_reg = datasets.make_regression(
        n_samples=n_samples, n_features=n_features,
        n_informative=2, random_state=random_state)

    train, test = train_test_split(range(n_samples))

    for name, Estimator in estimators:
        print("%s is being analysed" % name)
        if name in exclude:
            print("%s is being excluded" % name)
            continue
        if not has_fit_parameter(Estimator, 'sample_weight'):
            continue
        if is_classifier(Estimator):
            X, y = X_clf, y_clf
        elif is_regressor(Estimator):
            X, y = X_reg, y_reg
        else:
            print("%s is neither classifier nor regressor" % name)
            continue

        estimator_sw = Estimator()
        set_random_state(estimator_sw, random_state=random_state)
        estimator_sw.fit(X[train], y[train],
                         sample_weight=sample_weight[train])
        X_aug_train = X[train][sample_weight[train] == 1]
        y_aug_train = y[train][sample_weight[train] == 1]
        estimator_aug = Estimator()
        set_random_state(estimator_aug, random_state=random_state)
        estimator_aug.fit(X_aug_train, y_aug_train)

        precision = 6
        # if the estimator has a `coef_` attribute, compare the two fits
        if hasattr(estimator_sw, 'coef_'):
            yield (assert_array_almost_equal,
                   estimator_sw.coef_, estimator_aug.coef_, precision,
                   name + ' coef_ not equal')

        pred_sw = estimator_sw.predict(X[test])
        pred_aug = estimator_aug.predict(X[test])

        yield (assert_array_almost_equal, pred_sw, pred_aug, precision,
               name + ' prediction not equal')