|
6 | 6 | from utils import isclose, DataFile
|
7 | 7 |
|
8 | 8 |
|
9 |
| -def test_unigram_text_model(): |
| 9 | +def test_text_models(): |
10 | 10 | flatland = DataFile("EN-text/flatland.txt").read()
|
11 | 11 | wordseq = words(flatland)
|
12 |
| - P = UnigramTextModel(wordseq) |
| 12 | + P1 = UnigramTextModel(wordseq) |
| 13 | + P2 = NgramTextModel(2, wordseq) |
| 14 | + P3 = NgramTextModel(3, wordseq) |
| 15 | + |
| 16 | + # The most frequent entries in each model |
| 17 | + assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'), |
| 18 | + (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), |
| 19 | + (478, 'that'), (399, 'is'), (348, 'you')] |
| 20 | + |
| 21 | + assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')), |
| 22 | + (152, ('in', 'the')), (86, ('of', 'a')), |
| 23 | + (80, ('it', 'is')), |
| 24 | + (71, ('by', 'the')), (68, ('for', 'the')), |
| 25 | + (68, ('and', 'the')), (62, ('on', 'the')), |
| 26 | + (60, ('to', 'be'))] |
| 27 | + |
| 28 | + assert P3.top(10) == [(30, ('a', 'straight', 'line')), |
| 29 | + (19, ('of', 'three', 'dimensions')), |
| 30 | + (16, ('the', 'sense', 'of')), |
| 31 | + (13, ('by', 'the', 'sense')), |
| 32 | + (13, ('as', 'well', 'as')), |
| 33 | + (12, ('of', 'the', 'circles')), |
| 34 | + (12, ('of', 'sight', 'recognition')), |
| 35 | + (11, ('the', 'number', 'of')), |
| 36 | + (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))] |
13 | 37 |
|
14 |
| - s, p = viterbi_segment('itiseasytoreadwordswithoutspaces', P) |
| 38 | + assert isclose(P1['the'], 0.0611, rel_tol=0.001) |
| 39 | + |
| 40 | + assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01) |
| 41 | + |
| 42 | + assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) |
| 43 | + assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) |
| 44 | + assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001) |
| 45 | + |
| 46 | + assert P2.cond_prob.get(('went',)) is None |
| 47 | + |
| 48 | + assert P3.cond_prob['in', 'order'].dictionary == {'to': 6} |
| 49 | + |
| 50 | + |
| 51 | +def test_viterbi_segmentation(): |
| 52 | + flatland = DataFile("EN-text/flatland.txt").read() |
| 53 | + wordseq = words(flatland) |
| 54 | + P = UnigramTextModel(wordseq) |
| 55 | + text = "itiseasytoreadwordswithoutspaces" |
15 | 56 |
|
| 57 | + s, p = viterbi_segment(text,P) |
16 | 58 | assert s == [
|
17 | 59 | 'it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces']
|
18 | 60 |
|
@@ -56,48 +98,6 @@ def test_counting_probability_distribution():
|
56 | 98 | assert 1 / 7 <= min(ps) <= max(ps) <= 1 / 5
|
57 | 99 |
|
58 | 100 |
|
59 |
| -def test_ngram_models(): |
60 |
| - flatland = DataFile("EN-text/flatland.txt").read() |
61 |
| - wordseq = words(flatland) |
62 |
| - P1 = UnigramTextModel(wordseq) |
63 |
| - P2 = NgramTextModel(2, wordseq) |
64 |
| - P3 = NgramTextModel(3, wordseq) |
65 |
| - |
66 |
| - # The most frequent entries in each model |
67 |
| - assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'), |
68 |
| - (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), |
69 |
| - (478, 'that'), (399, 'is'), (348, 'you')] |
70 |
| - |
71 |
| - assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')), |
72 |
| - (152, ('in', 'the')), (86, ('of', 'a')), |
73 |
| - (80, ('it', 'is')), |
74 |
| - (71, ('by', 'the')), (68, ('for', 'the')), |
75 |
| - (68, ('and', 'the')), (62, ('on', 'the')), |
76 |
| - (60, ('to', 'be'))] |
77 |
| - |
78 |
| - assert P3.top(10) == [(30, ('a', 'straight', 'line')), |
79 |
| - (19, ('of', 'three', 'dimensions')), |
80 |
| - (16, ('the', 'sense', 'of')), |
81 |
| - (13, ('by', 'the', 'sense')), |
82 |
| - (13, ('as', 'well', 'as')), |
83 |
| - (12, ('of', 'the', 'circles')), |
84 |
| - (12, ('of', 'sight', 'recognition')), |
85 |
| - (11, ('the', 'number', 'of')), |
86 |
| - (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))] |
87 |
| - |
88 |
| - assert isclose(P1['the'], 0.0611, rel_tol=0.001) |
89 |
| - |
90 |
| - assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01) |
91 |
| - |
92 |
| - assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) |
93 |
| - assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) |
94 |
| - assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001) |
95 |
| - |
96 |
| - assert P2.cond_prob.get(('went',)) is None |
97 |
| - |
98 |
| - assert P3.cond_prob['in', 'order'].dictionary == {'to': 6} |
99 |
| - |
100 |
| - |
101 | 101 | def test_ir_system():
|
102 | 102 | from collections import namedtuple
|
103 | 103 | Results = namedtuple('IRResults', ['score', 'url'])
|
|
0 commit comments