From 455c2870d1285b381262f9b25f21ea91eef50f26 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Fri, 9 Jun 2017 16:04:13 +0200
Subject: [PATCH 1/3] Add benchmark_text_vectorizers.py

---
 benchmarks/bench_text_vectorizers.py | 73 ++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 benchmarks/bench_text_vectorizers.py

diff --git a/benchmarks/bench_text_vectorizers.py b/benchmarks/bench_text_vectorizers.py
new file mode 100644
index 0000000000000..68229c2278f15
--- /dev/null
+++ b/benchmarks/bench_text_vectorizers.py
@@ -0,0 +1,73 @@
+"""
+
+To run this benchmark, you will need:
+
+ * scikit-learn
+ * pandas
+ * memory_profiler
+ * psutil (optional, but recommended)
+
+"""
+
+from __future__ import print_function
+
+import timeit
+import itertools
+
+import numpy as np
+import pandas as pd
+from memory_profiler import memory_usage
+
+from sklearn.datasets import fetch_20newsgroups
+from sklearn.feature_extraction.text import (CountVectorizer, TfidfVectorizer,
+                                             HashingVectorizer)
+
+
+def run_vectorizer(Vectorizer, X, **params):
+    def f():
+        vect = Vectorizer(**params)
+        vect.fit_transform(X)
+    return f
+
+
+text = fetch_20newsgroups(subset='train').data
+
+print("="*80 + '\n#' + " Text vectorizers benchmark" + '\n' + '='*80 + '\n')
+print("Using a subset of the 20 newsgroups dataset ({} documents)."
+      .format(len(text)))
+print("This benchmark runs in ~20 min ...")
+
+res = []
+
+# Wrap the result of itertools.product with tqdm to get a progress bar
+for Vectorizer, (analyzer, ngram_range) in itertools.product(
+        [CountVectorizer, TfidfVectorizer, HashingVectorizer],
+        [('word', (1, 1)),
+         ('word', (1, 2)),
+         ('word', (1, 4)),
+         ('char', (4, 4)),
+         ('char_wb', (4, 4))
+         ]):
+
+    bench = {'vectorizer': Vectorizer.__name__}
+    params = {'analyzer': analyzer, 'ngram_range': ngram_range}
+    bench.update(params)
+    dt = timeit.repeat(run_vectorizer(Vectorizer, text, **params),
+                       number=1,
+                       repeat=3)
+    bench['time'] = "{:.2f} (±{:.2f})".format(np.mean(dt), np.std(dt))
+
+    mem_usage = memory_usage(run_vectorizer(Vectorizer, text, **params))
+
+    bench['memory'] = "{:.1f}".format(np.max(mem_usage))
+
+    res.append(bench)
+
+
+df = pd.DataFrame(res).set_index(['analyzer', 'ngram_range', 'vectorizer'])
+
+print('\n========== Run time performance (sec) ===========\n')
+print(df['time'].unstack(level=-1))
+
+print('\n=============== Memory usage (MB) ===============\n')
+print(df['memory'].unstack(level=-1))

From fcdec51d05fa408598bc595c5536ad1d48b9aeff Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Sun, 11 Jun 2017 11:28:56 +0200
Subject: [PATCH 2/3] Add encoding information and remove comment

---
 benchmarks/bench_text_vectorizers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/bench_text_vectorizers.py b/benchmarks/bench_text_vectorizers.py
index 68229c2278f15..9554a434e77ee 100644
--- a/benchmarks/bench_text_vectorizers.py
+++ b/benchmarks/bench_text_vectorizers.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 
 To run this benchmark, you will need:
@@ -39,7 +40,6 @@ def f():
 
 res = []
 
-# Wrap the result of itertools.product with tqdm to get a progress bar
 for Vectorizer, (analyzer, ngram_range) in itertools.product(
         [CountVectorizer, TfidfVectorizer, HashingVectorizer],
         [('word', (1, 1)),

From ce66d1e1340331402d94ed233cc5ff12c7be60f1 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Sun, 11 Jun 2017 12:38:48 +0200
Subject: [PATCH 3/3] Review comments

---
 benchmarks/bench_text_vectorizers.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/benchmarks/bench_text_vectorizers.py b/benchmarks/bench_text_vectorizers.py
index 9554a434e77ee..194eba7ea2e82 100644
--- a/benchmarks/bench_text_vectorizers.py
+++ b/benchmarks/bench_text_vectorizers.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 """
 
 To run this benchmark, you will need:
@@ -23,6 +22,8 @@
 from sklearn.feature_extraction.text import (CountVectorizer, TfidfVectorizer,
                                              HashingVectorizer)
 
+n_repeat = 3
+
 
 def run_vectorizer(Vectorizer, X, **params):
     def f():
@@ -54,8 +55,8 @@ def f():
     bench.update(params)
     dt = timeit.repeat(run_vectorizer(Vectorizer, text, **params),
                        number=1,
-                       repeat=3)
-    bench['time'] = "{:.2f} (±{:.2f})".format(np.mean(dt), np.std(dt))
+                       repeat=n_repeat)
+    bench['time'] = "{:.2f} (+-{:.2f})".format(np.mean(dt), np.std(dt))
 
     mem_usage = memory_usage(run_vectorizer(Vectorizer, text, **params))
 
@@ -67,6 +68,8 @@ def f():
 df = pd.DataFrame(res).set_index(['analyzer', 'ngram_range', 'vectorizer'])
 
 print('\n========== Run time performance (sec) ===========\n')
+print('Computing the mean and the standard deviation '
+      'of the run time over {} runs...\n'.format(n_repeat))
 print(df['time'].unstack(level=-1))
 
 print('\n=============== Memory usage (MB) ===============\n')
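
As a minimal, self-contained sketch of the measurement pattern this series introduces (wall time via timeit.repeat, peak memory via memory_profiler.memory_usage), assuming scikit-learn and memory_profiler are installed; the helper name fit_once and the single CountVectorizer configuration are illustrative only and not part of the patch:

    import timeit

    import numpy as np
    from memory_profiler import memory_usage

    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import CountVectorizer

    text = fetch_20newsgroups(subset='train').data

    def fit_once():
        # Build and fit a fresh vectorizer so each run pays the full cost.
        CountVectorizer(analyzer='word', ngram_range=(1, 1)).fit_transform(text)

    # One full fit per repetition (number=1), as in the benchmark above, so the
    # spread across repeats reflects run-to-run variation rather than loop overhead.
    dt = timeit.repeat(fit_once, number=1, repeat=3)
    # Memory is sampled while fit_once() runs; the max of the samples is the peak, in MiB.
    mem = memory_usage(fit_once)

    print("time: {:.2f}s (+-{:.2f})".format(np.mean(dt), np.std(dt)))
    print("peak memory: {:.1f} MiB".format(np.max(mem)))

The full benchmark simply repeats this measurement for every (vectorizer, analyzer, ngram_range) combination and collects the results in a pandas DataFrame.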