[MRG+1] Pytest parametrize part3 - feature_extraction, gaussian_process modules #11143

Merged: 3 commits, Jun 4, 2018
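Summary: this PR (part 3 of a series) converts tests in the feature_extraction and gaussian_process modules from hand-written for-loops over parameter combinations to pytest.mark.parametrize, and drops the obsolete nose-style run_module_suite entry points from two feature_selection test modules. Stacked parametrize decorators generate the cross product of their argument lists, so every combination is collected and reported as its own test case. A minimal sketch of the pattern (toy names, not code from this PR):

import pytest

# Stacked decorators multiply: 2 flags x 3 values = 6 collected tests.
@pytest.mark.parametrize('flag', (True, False))
@pytest.mark.parametrize('value', (1, 2, 3))
def test_doubling(flag, value):
    result = value * 2 if flag else value
    assert result >= value

Unlike a loop inside one test body, a failing combination is reported individually (with an id such as test_doubling[1-True]) and does not abort the remaining combinations.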
52 changes: 27 additions & 25 deletions sklearn/feature_extraction/tests/test_dict_vectorizer.py
@@ -5,43 +5,45 @@
 from random import Random
 import numpy as np
 import scipy.sparse as sp

 from numpy.testing import assert_array_equal

+import pytest
+
 from sklearn.utils.testing import (assert_equal, assert_in,
                                    assert_false, assert_true)

 from sklearn.feature_extraction import DictVectorizer
 from sklearn.feature_selection import SelectKBest, chi2


-def test_dictvectorizer():
+@pytest.mark.parametrize('sparse', (True, False))
+@pytest.mark.parametrize('dtype', (int, np.float32, np.int16))
+@pytest.mark.parametrize('sort', (True, False))
+@pytest.mark.parametrize('iterable', (True, False))
+def test_dictvectorizer(sparse, dtype, sort, iterable):
     D = [{"foo": 1, "bar": 3},
          {"bar": 4, "baz": 2},
          {"bar": 1, "quux": 1, "quuux": 2}]

-    for sparse in (True, False):
-        for dtype in (int, np.float32, np.int16):
-            for sort in (True, False):
-                for iterable in (True, False):
-                    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
-                    X = v.fit_transform(iter(D) if iterable else D)
-
-                    assert_equal(sp.issparse(X), sparse)
-                    assert_equal(X.shape, (3, 5))
-                    assert_equal(X.sum(), 14)
-                    assert_equal(v.inverse_transform(X), D)
-
-                    if sparse:
-                        # CSR matrices can't be compared for equality
-                        assert_array_equal(X.A, v.transform(iter(D) if iterable
-                                                            else D).A)
-                    else:
-                        assert_array_equal(X, v.transform(iter(D) if iterable
-                                                          else D))
-
-                    if sort:
-                        assert_equal(v.feature_names_,
-                                     sorted(v.feature_names_))
+    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
+    X = v.fit_transform(iter(D) if iterable else D)
+
+    assert_equal(sp.issparse(X), sparse)
+    assert_equal(X.shape, (3, 5))
+    assert_equal(X.sum(), 14)
+    assert_equal(v.inverse_transform(X), D)
+
+    if sparse:
+        # CSR matrices can't be compared for equality
+        assert_array_equal(X.A, v.transform(iter(D) if iterable
+                                            else D).A)
+    else:
+        assert_array_equal(X, v.transform(iter(D) if iterable
+                                          else D))
+
+    if sort:
+        assert_equal(v.feature_names_,
+                     sorted(v.feature_names_))


 def test_feature_selection():
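Note: the four stacked decorators above replace a four-level nested loop, so pytest now collects 2 (sparse) x 3 (dtype) x 2 (sort) x 2 (iterable) = 24 separate test_dictvectorizer cases, and one failing combination no longer hides the rest. The expansion can be inspected without executing any test body; a sketch using the standard pytest.main API (the path is the file shown above):

import pytest

# Print the 24 parametrized test ids without running them.
pytest.main(['--collect-only', '-q',
             'sklearn/feature_extraction/tests/test_dict_vectorizer.py'])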
136 changes: 66 additions & 70 deletions sklearn/feature_extraction/tests/test_text.py
@@ -115,42 +115,42 @@ def test_to_ascii():
     assert_equal(strip_accents_ascii(a), expected)


-def test_word_analyzer_unigrams():
-    for Vectorizer in (CountVectorizer, HashingVectorizer):
-        wa = Vectorizer(strip_accents='ascii').build_analyzer()
-        text = ("J'ai mang\xe9 du kangourou ce midi, "
-                "c'\xe9tait pas tr\xeas bon.")
-        expected = ['ai', 'mange', 'du', 'kangourou', 'ce', 'midi',
-                    'etait', 'pas', 'tres', 'bon']
-        assert_equal(wa(text), expected)
-
-        text = "This is a test, really.\n\n I met Harry yesterday."
-        expected = ['this', 'is', 'test', 'really', 'met', 'harry',
-                    'yesterday']
-        assert_equal(wa(text), expected)
-
-        wa = Vectorizer(input='file').build_analyzer()
-        text = StringIO("This is a test with a file-like object!")
-        expected = ['this', 'is', 'test', 'with', 'file', 'like',
-                    'object']
-        assert_equal(wa(text), expected)
-
-        # with custom preprocessor
-        wa = Vectorizer(preprocessor=uppercase).build_analyzer()
-        text = ("J'ai mang\xe9 du kangourou ce midi, "
-                " c'\xe9tait pas tr\xeas bon.")
-        expected = ['AI', 'MANGE', 'DU', 'KANGOUROU', 'CE', 'MIDI',
-                    'ETAIT', 'PAS', 'TRES', 'BON']
-        assert_equal(wa(text), expected)
-
-        # with custom tokenizer
-        wa = Vectorizer(tokenizer=split_tokenize,
-                        strip_accents='ascii').build_analyzer()
-        text = ("J'ai mang\xe9 du kangourou ce midi, "
-                "c'\xe9tait pas tr\xeas bon.")
-        expected = ["j'ai", 'mange', 'du', 'kangourou', 'ce', 'midi,',
-                    "c'etait", 'pas', 'tres', 'bon.']
-        assert_equal(wa(text), expected)
+@pytest.mark.parametrize('Vectorizer', (CountVectorizer, HashingVectorizer))
+def test_word_analyzer_unigrams(Vectorizer):
+    wa = Vectorizer(strip_accents='ascii').build_analyzer()
+    text = ("J'ai mang\xe9 du kangourou ce midi, "
+            "c'\xe9tait pas tr\xeas bon.")
+    expected = ['ai', 'mange', 'du', 'kangourou', 'ce', 'midi',
+                'etait', 'pas', 'tres', 'bon']
+    assert_equal(wa(text), expected)
+
+    text = "This is a test, really.\n\n I met Harry yesterday."
+    expected = ['this', 'is', 'test', 'really', 'met', 'harry',
+                'yesterday']
+    assert_equal(wa(text), expected)
+
+    wa = Vectorizer(input='file').build_analyzer()
+    text = StringIO("This is a test with a file-like object!")
+    expected = ['this', 'is', 'test', 'with', 'file', 'like',
+                'object']
+    assert_equal(wa(text), expected)
+
+    # with custom preprocessor
+    wa = Vectorizer(preprocessor=uppercase).build_analyzer()
+    text = ("J'ai mang\xe9 du kangourou ce midi, "
+            " c'\xe9tait pas tr\xeas bon.")
+    expected = ['AI', 'MANGE', 'DU', 'KANGOUROU', 'CE', 'MIDI',
+                'ETAIT', 'PAS', 'TRES', 'BON']
+    assert_equal(wa(text), expected)
+
+    # with custom tokenizer
+    wa = Vectorizer(tokenizer=split_tokenize,
+                    strip_accents='ascii').build_analyzer()
+    text = ("J'ai mang\xe9 du kangourou ce midi, "
+            "c'\xe9tait pas tr\xeas bon.")
+    expected = ["j'ai", 'mange', 'du', 'kangourou', 'ce', 'midi,',
+                "c'etait", 'pas', 'tres', 'bon.']
+    assert_equal(wa(text), expected)


 def test_word_analyzer_unigrams_and_bigrams():

@@ -574,22 +574,17 @@ def test_feature_names():
     assert_equal(idx, cv.vocabulary_.get(name))


-def test_vectorizer_max_features():
-    vec_factories = (
-        CountVectorizer,
-        TfidfVectorizer,
-    )
-
+@pytest.mark.parametrize('Vectorizer', (CountVectorizer, TfidfVectorizer))
+def test_vectorizer_max_features(Vectorizer):
     expected_vocabulary = set(['burger', 'beer', 'salad', 'pizza'])
     expected_stop_words = set([u'celeri', u'tomato', u'copyright', u'coke',
                                u'sparkling', u'water', u'the'])

-    for vec_factory in vec_factories:
-        # test bounded number of extracted features
-        vectorizer = vec_factory(max_df=0.6, max_features=4)
-        vectorizer.fit(ALL_FOOD_DOCS)
-        assert_equal(set(vectorizer.vocabulary_), expected_vocabulary)
-        assert_equal(vectorizer.stop_words_, expected_stop_words)
+    # test bounded number of extracted features
+    vectorizer = Vectorizer(max_df=0.6, max_features=4)
+    vectorizer.fit(ALL_FOOD_DOCS)
+    assert_equal(set(vectorizer.vocabulary_), expected_vocabulary)
+    assert_equal(vectorizer.stop_words_, expected_stop_words)


 def test_count_vectorizer_max_features():

@@ -713,23 +708,24 @@ def test_hashed_binary_occurrences():
     assert_equal(X.dtype, np.float64)


-def test_vectorizer_inverse_transform():
+@pytest.mark.parametrize('Vectorizer', (CountVectorizer, TfidfVectorizer))
+def test_vectorizer_inverse_transform(Vectorizer):
     # raw documents
     data = ALL_FOOD_DOCS
-    for vectorizer in (TfidfVectorizer(), CountVectorizer()):
-        transformed_data = vectorizer.fit_transform(data)
-        inversed_data = vectorizer.inverse_transform(transformed_data)
-        analyze = vectorizer.build_analyzer()
-        for doc, inversed_terms in zip(data, inversed_data):
-            terms = np.sort(np.unique(analyze(doc)))
-            inversed_terms = np.sort(np.unique(inversed_terms))
-            assert_array_equal(terms, inversed_terms)
-
-        # Test that inverse_transform also works with numpy arrays
-        transformed_data = transformed_data.toarray()
-        inversed_data2 = vectorizer.inverse_transform(transformed_data)
-        for terms, terms2 in zip(inversed_data, inversed_data2):
-            assert_array_equal(np.sort(terms), np.sort(terms2))
+    vectorizer = Vectorizer()
+    transformed_data = vectorizer.fit_transform(data)
+    inversed_data = vectorizer.inverse_transform(transformed_data)
+    analyze = vectorizer.build_analyzer()
+    for doc, inversed_terms in zip(data, inversed_data):
+        terms = np.sort(np.unique(analyze(doc)))
+        inversed_terms = np.sort(np.unique(inversed_terms))
+        assert_array_equal(terms, inversed_terms)
+
+    # Test that inverse_transform also works with numpy arrays
+    transformed_data = transformed_data.toarray()
+    inversed_data2 = vectorizer.inverse_transform(transformed_data)
+    for terms, terms2 in zip(inversed_data, inversed_data2):
+        assert_array_equal(np.sort(terms), np.sort(terms2))


 def test_count_vectorizer_pipeline_grid_selection():

@@ -1030,16 +1026,16 @@ def test_vectorizer_vocab_clone():
     assert_equal(vect_vocab_clone.vocabulary_, vect_vocab.vocabulary_)


-def test_vectorizer_string_object_as_input():
+@pytest.mark.parametrize('Vectorizer',
+                         (CountVectorizer, TfidfVectorizer, HashingVectorizer))
+def test_vectorizer_string_object_as_input(Vectorizer):
     message = ("Iterable over raw text documents expected, "
                "string object received.")
-    for vec in [CountVectorizer(), TfidfVectorizer(), HashingVectorizer()]:
-        assert_raise_message(
+    vec = Vectorizer()
+    assert_raise_message(
             ValueError, message, vec.fit_transform, "hello world!")
-        assert_raise_message(
-            ValueError, message, vec.fit, "hello world!")
-        assert_raise_message(
-            ValueError, message, vec.transform, "hello world!")
+    assert_raise_message(ValueError, message, vec.fit, "hello world!")
+    assert_raise_message(ValueError, message, vec.transform, "hello world!")


 @pytest.mark.parametrize("vec", [
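A detail worth noting in test_text.py: the parametrized value is the vectorizer class itself (CountVectorizer, TfidfVectorizer, HashingVectorizer), and each test constructs a fresh instance, so no fitted state leaks between parameter combinations. A minimal sketch of the same class-parametrization idiom, with made-up stand-in classes:

import json

import pytest


class JSONCodec:
    # Stand-in class for the sketch, not part of sklearn.
    def encode(self, obj):
        return json.dumps(obj)


class ReprCodec:
    # Stand-in class for the sketch, not part of sklearn.
    def encode(self, obj):
        return repr(obj)


@pytest.mark.parametrize('Codec', (JSONCodec, ReprCodec))
def test_codec_encodes_lists(Codec):
    codec = Codec()  # fresh instance per collected case
    assert codec.encode([1, 2]).startswith('[')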
5 changes: 0 additions & 5 deletions sklearn/feature_selection/tests/test_feature_select.py
@@ -7,7 +7,6 @@
 import numpy as np
 from scipy import stats, sparse

-from numpy.testing import run_module_suite
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_raises

@@ -670,7 +669,3 @@ def test_mutual_info_regression():
     gtruth = np.zeros(10)
     gtruth[:2] = 1
     assert_array_equal(support, gtruth)
-
-
-if __name__ == '__main__':
-    run_module_suite()
5 changes: 0 additions & 5 deletions sklearn/feature_selection/tests/test_mutual_info.py
@@ -1,7 +1,6 @@
 from __future__ import division

 import numpy as np
-from numpy.testing import run_module_suite
 from scipy.sparse import csr_matrix

 from sklearn.utils import check_random_state

@@ -200,7 +199,3 @@ def test_mutual_info_options():
     assert_array_equal(mi_3, mi_4)

     assert_false(np.allclose(mi_1, mi_3))
-
-
-if __name__ == '__main__':
-    run_module_suite()
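The two feature_selection test modules above lose only their nose-era entry point: numpy.testing.run_module_suite made a test module runnable as a script, but pytest discovers test_* functions by naming convention alone, so the if __name__ == '__main__' block is dead code. An equivalent single-module run, sketched with the standard pytest API:

import pytest

# Discovery replaces the removed run_module_suite() entry point.
pytest.main(['sklearn/feature_selection/tests/test_mutual_info.py'])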
17 changes: 10 additions & 7 deletions sklearn/gaussian_process/tests/test_gaussian_process.py
@@ -7,6 +7,8 @@

 import numpy as np

+import pytest
+
 from sklearn.gaussian_process import GaussianProcess
 from sklearn.gaussian_process import regression_models as regression
 from sklearn.gaussian_process import correlation_models as correlation

@@ -100,16 +102,17 @@ def test_wrong_number_of_outputs():
     assert_raises(ValueError, gp.fit, [[1, 2, 3], [4, 5, 6]], [1, 2, 3])


-def test_more_builtin_correlation_models(random_start=1):
+@pytest.mark.parametrize(
+    'corr',
+    ['absolute_exponential', 'squared_exponential', 'cubic', 'linear'])
+def test_more_builtin_correlation_models(corr):
     # Repeat test_1d and test_2d for several built-in correlation
     # models specified as strings.
-    all_corr = ['absolute_exponential', 'squared_exponential', 'cubic',
-                'linear']
+    random_start = 1

-    for corr in all_corr:
-        test_1d(regr='constant', corr=corr, random_start=random_start)
-        test_2d(regr='constant', corr=corr, random_start=random_start)
-        test_2d_2d(regr='constant', corr=corr, random_start=random_start)
+    test_1d(regr='constant', corr=corr, random_start=random_start)
+    test_2d(regr='constant', corr=corr, random_start=random_start)
+    test_2d_2d(regr='constant', corr=corr, random_start=random_start)


 def test_ordinary_kriging():
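Because the corr parameters in test_gaussian_process.py are plain strings, pytest derives each test id from the string itself, so a failure reads as test_more_builtin_correlation_models[cubic] rather than an opaque index, and a single model can be selected with pytest's standard -k option (e.g. pytest ... -k cubic). A toy sketch of string-derived ids (stand-in test, not from this PR):

import pytest

# String values become the ids: test_greets[ada], test_greets[grace].
@pytest.mark.parametrize('name', ['ada', 'grace'])
def test_greets(name):
    greeting = 'hello ' + name
    assert name in greeting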