From 69e8b6f53ef6ff0bd17fc13edd613dae48d681e7 Mon Sep 17 00:00:00 2001 From: Antonis Maronikolakis Date: Sat, 11 Mar 2017 13:10:41 +0200 Subject: [PATCH 1/2] Rearranged Tests - test_ngram_models to the top - added test_viterbi_segmentation - removed test_unigram_text_model --- tests/test_text.py | 90 +++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/tests/test_text.py b/tests/test_text.py index 0cd3e675c..2391820e3 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -6,13 +6,55 @@ from utils import isclose, DataFile -def test_unigram_text_model(): +def test_ngram_models(): flatland = DataFile("EN-text/flatland.txt").read() wordseq = words(flatland) - P = UnigramTextModel(wordseq) + P1 = UnigramTextModel(wordseq) + P2 = NgramTextModel(2, wordseq) + P3 = NgramTextModel(3, wordseq) + + # The most frequent entries in each model + assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'), + (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), + (478, 'that'), (399, 'is'), (348, 'you')] + + assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')), + (152, ('in', 'the')), (86, ('of', 'a')), + (80, ('it', 'is')), + (71, ('by', 'the')), (68, ('for', 'the')), + (68, ('and', 'the')), (62, ('on', 'the')), + (60, ('to', 'be'))] - s, p = viterbi_segment('itiseasytoreadwordswithoutspaces', P) + assert P3.top(10) == [(30, ('a', 'straight', 'line')), + (19, ('of', 'three', 'dimensions')), + (16, ('the', 'sense', 'of')), + (13, ('by', 'the', 'sense')), + (13, ('as', 'well', 'as')), + (12, ('of', 'the', 'circles')), + (12, ('of', 'sight', 'recognition')), + (11, ('the', 'number', 'of')), + (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))] + + assert isclose(P1['the'], 0.0611, rel_tol=0.001) + + assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01) + + assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) + assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) + assert isclose(P3['so', 'as', 'to'], 
0.000323, rel_tol=0.001) + + assert P2.cond_prob.get(('went',)) is None + + assert P3.cond_prob['in', 'order'].dictionary == {'to': 6} + + +def test_viterbi_segmentation(): + flatland = DataFile("EN-text/flatland.txt").read() + wordseq = words(flatland) + P = UnigramTextModel(wordseq) + text = "itiseasytoreadwordswithoutspaces" + s, p = viterbi_segment(text,P) assert s == [ 'it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces'] @@ -56,48 +98,6 @@ def test_counting_probability_distribution(): assert 1 / 7 <= min(ps) <= max(ps) <= 1 / 5 -def test_ngram_models(): - flatland = DataFile("EN-text/flatland.txt").read() - wordseq = words(flatland) - P1 = UnigramTextModel(wordseq) - P2 = NgramTextModel(2, wordseq) - P3 = NgramTextModel(3, wordseq) - - # The most frequent entries in each model - assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'), - (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), - (478, 'that'), (399, 'is'), (348, 'you')] - - assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')), - (152, ('in', 'the')), (86, ('of', 'a')), - (80, ('it', 'is')), - (71, ('by', 'the')), (68, ('for', 'the')), - (68, ('and', 'the')), (62, ('on', 'the')), - (60, ('to', 'be'))] - - assert P3.top(10) == [(30, ('a', 'straight', 'line')), - (19, ('of', 'three', 'dimensions')), - (16, ('the', 'sense', 'of')), - (13, ('by', 'the', 'sense')), - (13, ('as', 'well', 'as')), - (12, ('of', 'the', 'circles')), - (12, ('of', 'sight', 'recognition')), - (11, ('the', 'number', 'of')), - (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))] - - assert isclose(P1['the'], 0.0611, rel_tol=0.001) - - assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01) - - assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) - assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) - assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001) - - assert P2.cond_prob.get(('went',)) is None - - assert P3.cond_prob['in', 'order'].dictionary == {'to': 6} - - def test_ir_system(): 
from collections import namedtuple Results = namedtuple('IRResults', ['score', 'url']) From ebf6fb537b9ebe16dd6d322abf018ee512051677 Mon Sep 17 00:00:00 2001 From: Antonis Maronikolakis Date: Sat, 11 Mar 2017 13:39:44 +0200 Subject: [PATCH 2/2] "test_ngram_models" to "test_text_models" --- tests/test_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_text.py b/tests/test_text.py index 2391820e3..d58cd497a 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -6,7 +6,7 @@ from utils import isclose, DataFile -def test_ngram_models(): +def test_text_models(): flatland = DataFile("EN-text/flatland.txt").read() wordseq = words(flatland) P1 = UnigramTextModel(wordseq)