FIX scikit-learn#3485: class_weight='auto' on SGDClassifier

ogrisel · ogrisel · commit 07560e45c234 · 2014-08-01T15:05:32.000+02:00
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -105,6 +105,10 @@ Bug fixes
      and ``pandas.DataFrame`` in recent versions of pandas. By
      `Gael Varoquaux`_.
 
+   - Fixed a regression for :class:`linear_model.SGDClassifier` with
+     ``class_weight="auto"`` on data with non-contiguous labels. By
+     `Olivier Grisel`_.
+
 
 .. _changes_0_15:
 
diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py
@@ -327,10 +327,8 @@ def _partial_fit(self, X, y, alpha, C,
         n_classes = self.classes_.shape[0]
 
         # Allocate datastructures from input arguments
-        y_ind = np.searchsorted(self.classes_, y)   # XXX use a LabelBinarizer?
         self._expanded_class_weight = compute_class_weight(self.class_weight,
-                                                           self.classes_,
-                                                           y_ind)
+                                                           self.classes_, y)
         sample_weight = self._validate_sample_weight(sample_weight, n_samples)
 
         if self.coef_ is None or coef_init is not None:
diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py
@@ -847,7 +847,7 @@ def test_underflow_or_overlow():
     assert_array_equal(np.unique(y), [0, 1])
 
     model = SGDClassifier(alpha=0.1, loss='squared_hinge', n_iter=500)
-    
+
     # smoke test: model is stable on scaled data
     model.fit(scale(X), y)
 
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
@@ -26,6 +26,7 @@
 from sklearn.datasets import make_classification
 
 from sklearn.cross_validation import train_test_split
+from sklearn.linear_model.base import LinearClassifierMixin
 from sklearn.utils.estimator_checks import (
     check_parameters_default_constructible,
     check_regressors_classifiers_sparse_data,
@@ -44,6 +45,7 @@
     check_classifiers_pickle,
     check_class_weight_classifiers,
     check_class_weight_auto_classifiers,
+    check_class_weight_auto_linear_classifier,
     check_estimators_overwrite_params,
     check_cluster_overwrite_params,
     check_sparsify_binary_classifier,
@@ -214,7 +216,7 @@ def test_class_weight_classifiers():
         yield check_class_weight_classifiers, name, Classifier
 
 
-def test_class_weight_auto_classifies():
+def test_class_weight_auto_classifiers():
     """Test that class_weight="auto" improves f1-score"""
 
     # This test is broken; its success depends on:
@@ -251,6 +253,26 @@ def test_class_weight_auto_classifies():
                        X_train, y_train, X_test, y_test, weights)
 
 
+def test_class_weight_auto_linear_classifiers():
+    classifiers = all_estimators(type_filter='classifier')
+
+    with warnings.catch_warnings(record=True):
+        linear_classifiers = [
+            (name, clazz)
+            for name, clazz in classifiers
+            if 'class_weight' in clazz().get_params().keys()
+               and issubclass(clazz, LinearClassifierMixin)]
+
+    for name, Classifier in linear_classifiers:
+        if name == "LogisticRegressionCV":
+            # Contrary to RidgeClassifierCV, LogisticRegressionCV uses actual
+            # CV folds and fits a model for each CV iteration before averaging
+            # the coef. Therefore it is not expected to behave exactly like
+            # the other linear models.
+            continue
+        yield check_class_weight_auto_linear_classifier, name, Classifier
+
+
 def test_estimators_overwrite_params():
     # test whether any classifier overwrites his init parameters during fit
     for est_type in ["classifier", "regressor", "transformer"]:
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -725,6 +725,36 @@ def check_class_weight_auto_classifiers(name, Classifier, X_train, y_train,
                    f1_score(y_test, y_pred))
 
 
+def check_class_weight_auto_linear_classifier(name, Classifier):
+    """Test class weights with non-contiguous class labels."""
+    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
+                  [1.0, 1.0], [1.0, 0.0]])
+    y = [1, 1, 1, -1, -1]
+
+    with warnings.catch_warnings(record=True):
+        classifier = Classifier()
+    if hasattr(classifier, "n_iter"):
+        # This is a very small dataset; the default n_iter is likely to
+        # prevent convergence
+        classifier.set_params(n_iter=1000)
+    set_random_state(classifier)
+
+    # Let the model compute the class frequencies
+    classifier.set_params(class_weight='auto')
+    coef_auto = classifier.fit(X, y).coef_.copy()
+
+    # Count each label occurrence to reweight manually
+    mean_weight = (1. / 3 + 1. / 2) / 2
+    class_weight = {
+        1: 1. / 3 / mean_weight,
+        -1: 1. / 2 / mean_weight,
+    }
+    classifier.set_params(class_weight=class_weight)
+    coef_manual = classifier.fit(X, y).coef_.copy()
+
+    assert_array_almost_equal(coef_auto, coef_manual)
+
+
 def check_estimators_overwrite_params(name, Estimator):
     X, y = make_blobs(random_state=0, n_samples=9)
     y = multioutput_estimator_convert_y_2d(name, y)