Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 26cd806

Browse files
committed
Merge pull request scikit-learn#4345 from bendavies/20newsgroups_example
update 20newsgroups text classification example for best practice
2 parents 9ce8322 + 3d8114e commit 26cd806

File tree

1 file changed

+7
-17
lines changed

1 file changed

+7
-17
lines changed

examples/text/document_classification_20newsgroups.py

Lines changed: 7 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from sklearn.feature_extraction.text import HashingVectorizer
3737
from sklearn.feature_selection import SelectKBest, chi2
3838
from sklearn.linear_model import RidgeClassifier
39+
from sklearn.pipeline import Pipeline
3940
from sklearn.svm import LinearSVC
4041
from sklearn.linear_model import SGDClassifier
4142
from sklearn.linear_model import Perceptron
@@ -276,25 +277,14 @@ def benchmark(clf):
276277
results.append(benchmark(MultinomialNB(alpha=.01)))
277278
results.append(benchmark(BernoulliNB(alpha=.01)))
278279

279-
280-
class L1LinearSVC(LinearSVC):
281-
282-
def fit(self, X, y):
283-
# The smaller C, the stronger the regularization.
284-
# The more regularization, the more sparsity.
285-
self.transformer_ = LinearSVC(penalty="l1",
286-
dual=False, tol=1e-3)
287-
X = self.transformer_.fit_transform(X, y)
288-
return LinearSVC.fit(self, X, y)
289-
290-
def predict(self, X):
291-
X = self.transformer_.transform(X)
292-
return LinearSVC.predict(self, X)
293-
294280
print('=' * 80)
295281
print("LinearSVC with L1-based feature selection")
296-
results.append(benchmark(L1LinearSVC()))
297-
282+
# The smaller C, the stronger the regularization.
283+
# The more regularization, the more sparsity.
284+
results.append(benchmark(Pipeline([
285+
('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
286+
('classification', LinearSVC())
287+
])))
298288

299289
# make some plots
300290

0 commit comments

Comments (0)