Updated test_text.py (aimacode#349)

antmarakis · norvig · commit 8e0bfd34cb5a · 2017-03-18T01:16:54.000-07:00
* Rearranged Tests

- moved test_ngram_models to the top
- added test_viterbi_segmentation
- removed test_unigram_text_model

* Renamed "test_ngram_models" to "test_text_models"
diff --git a/tests/test_text.py b/tests/test_text.py
@@ -6,13 +6,55 @@
 from utils import isclose, DataFile
 
 
-def test_unigram_text_model():
+def test_text_models():
     flatland = DataFile("EN-text/flatland.txt").read()
     wordseq = words(flatland)
-    P = UnigramTextModel(wordseq)
+    P1 = UnigramTextModel(wordseq)
+    P2 = NgramTextModel(2, wordseq)
+    P3 = NgramTextModel(3, wordseq)
+
+    # The most frequent entries in each model
+    assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'),
+                          (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'),
+                          (478, 'that'), (399, 'is'), (348, 'you')]
+
+    assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')),
+                          (152, ('in', 'the')), (86, ('of', 'a')),
+                          (80, ('it', 'is')),
+                          (71, ('by', 'the')), (68, ('for', 'the')),
+                          (68, ('and', 'the')), (62, ('on', 'the')),
+                          (60, ('to', 'be'))]
+
+    assert P3.top(10) == [(30, ('a', 'straight', 'line')),
+                          (19, ('of', 'three', 'dimensions')),
+                          (16, ('the', 'sense', 'of')),
+                          (13, ('by', 'the', 'sense')),
+                          (13, ('as', 'well', 'as')),
+                          (12, ('of', 'the', 'circles')),
+                          (12, ('of', 'sight', 'recognition')),
+                          (11, ('the', 'number', 'of')),
+                          (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))]
 
-    s, p = viterbi_segment('itiseasytoreadwordswithoutspaces', P)
+    assert isclose(P1['the'], 0.0611, rel_tol=0.001)
+
+    assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01)
+
+    assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001)
+    assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001)
+    assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001)
+
+    assert P2.cond_prob.get(('went',)) is None
+
+    assert P3.cond_prob['in', 'order'].dictionary == {'to': 6}
+
+
+def test_viterbi_segmentation():
+    flatland = DataFile("EN-text/flatland.txt").read()
+    wordseq = words(flatland)
+    P = UnigramTextModel(wordseq)
+    text = "itiseasytoreadwordswithoutspaces"
 
+    s, p = viterbi_segment(text,P)
     assert s == [
         'it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces']
 
@@ -56,48 +98,6 @@ def test_counting_probability_distribution():
     assert 1 / 7 <= min(ps) <= max(ps) <= 1 / 5
 
 
-def test_ngram_models():
-    flatland = DataFile("EN-text/flatland.txt").read()
-    wordseq = words(flatland)
-    P1 = UnigramTextModel(wordseq)
-    P2 = NgramTextModel(2, wordseq)
-    P3 = NgramTextModel(3, wordseq)
-
-    # The most frequent entries in each model
-    assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'),
-                          (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'),
-                          (478, 'that'), (399, 'is'), (348, 'you')]
-
-    assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')),
-                          (152, ('in', 'the')), (86, ('of', 'a')),
-                          (80, ('it', 'is')),
-                          (71, ('by', 'the')), (68, ('for', 'the')),
-                          (68, ('and', 'the')), (62, ('on', 'the')),
-                          (60, ('to', 'be'))]
-
-    assert P3.top(10) == [(30, ('a', 'straight', 'line')),
-                          (19, ('of', 'three', 'dimensions')),
-                          (16, ('the', 'sense', 'of')),
-                          (13, ('by', 'the', 'sense')),
-                          (13, ('as', 'well', 'as')),
-                          (12, ('of', 'the', 'circles')),
-                          (12, ('of', 'sight', 'recognition')),
-                          (11, ('the', 'number', 'of')),
-                          (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))]
-
-    assert isclose(P1['the'], 0.0611, rel_tol=0.001)
-
-    assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01)
-
-    assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001)
-    assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001)
-    assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001)
-
-    assert P2.cond_prob.get(('went',)) is None
-
-    assert P3.cond_prob['in', 'order'].dictionary == {'to': 6}
-
-
 def test_ir_system():
     from collections import namedtuple
     Results = namedtuple('IRResults', ['score', 'url'])