From 69e8b6f53ef6ff0bd17fc13edd613dae48d681e7 Mon Sep 17 00:00:00 2001
From: Antonis Maronikolakis <antmarakis@programmers.gr>
Date: Sat, 11 Mar 2017 13:10:41 +0200
Subject: [PATCH 1/2] Rearranged Tests

- moved test_ngram_models to the top
- added test_viterbi_segmentation
- removed test_unigram_text_model
---
 tests/test_text.py | 90 +++++++++++++++++++++++-----------------------
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/tests/test_text.py b/tests/test_text.py
index 0cd3e675c..2391820e3 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -6,13 +6,55 @@
 from utils import isclose, DataFile
 
 
-def test_unigram_text_model():
+def test_ngram_models():
     flatland = DataFile("EN-text/flatland.txt").read()
     wordseq = words(flatland)
-    P = UnigramTextModel(wordseq)
+    P1 = UnigramTextModel(wordseq)
+    P2 = NgramTextModel(2, wordseq)
+    P3 = NgramTextModel(3, wordseq)
+
+    # The most frequent entries in each model
+    assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'),
+                          (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'),
+                          (478, 'that'), (399, 'is'), (348, 'you')]
+
+    assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')),
+                          (152, ('in', 'the')), (86, ('of', 'a')),
+                          (80, ('it', 'is')),
+                          (71, ('by', 'the')), (68, ('for', 'the')),
+                          (68, ('and', 'the')), (62, ('on', 'the')),
+                          (60, ('to', 'be'))]
 
-    s, p = viterbi_segment('itiseasytoreadwordswithoutspaces', P)
+    assert P3.top(10) == [(30, ('a', 'straight', 'line')),
+                          (19, ('of', 'three', 'dimensions')),
+                          (16, ('the', 'sense', 'of')),
+                          (13, ('by', 'the', 'sense')),
+                          (13, ('as', 'well', 'as')),
+                          (12, ('of', 'the', 'circles')),
+                          (12, ('of', 'sight', 'recognition')),
+                          (11, ('the', 'number', 'of')),
+                          (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))]
+
+    assert isclose(P1['the'], 0.0611, rel_tol=0.001)
+
+    assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01)
+
+    assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001)
+    assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001)
+    assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001)
+
+    assert P2.cond_prob.get(('went',)) is None
+
+    assert P3.cond_prob['in', 'order'].dictionary == {'to': 6}
+
+
+def test_viterbi_segmentation():
+    flatland = DataFile("EN-text/flatland.txt").read()
+    wordseq = words(flatland)
+    P = UnigramTextModel(wordseq)
+    text = "itiseasytoreadwordswithoutspaces"
 
+    s, p = viterbi_segment(text,P)
     assert s == [
         'it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces']
 
@@ -56,48 +98,6 @@ def test_counting_probability_distribution():
     assert 1 / 7 <= min(ps) <= max(ps) <= 1 / 5
 
 
-def test_ngram_models():
-    flatland = DataFile("EN-text/flatland.txt").read()
-    wordseq = words(flatland)
-    P1 = UnigramTextModel(wordseq)
-    P2 = NgramTextModel(2, wordseq)
-    P3 = NgramTextModel(3, wordseq)
-
-    # The most frequent entries in each model
-    assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'),
-                          (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'),
-                          (478, 'that'), (399, 'is'), (348, 'you')]
-
-    assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')),
-                          (152, ('in', 'the')), (86, ('of', 'a')),
-                          (80, ('it', 'is')),
-                          (71, ('by', 'the')), (68, ('for', 'the')),
-                          (68, ('and', 'the')), (62, ('on', 'the')),
-                          (60, ('to', 'be'))]
-
-    assert P3.top(10) == [(30, ('a', 'straight', 'line')),
-                          (19, ('of', 'three', 'dimensions')),
-                          (16, ('the', 'sense', 'of')),
-                          (13, ('by', 'the', 'sense')),
-                          (13, ('as', 'well', 'as')),
-                          (12, ('of', 'the', 'circles')),
-                          (12, ('of', 'sight', 'recognition')),
-                          (11, ('the', 'number', 'of')),
-                          (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))]
-
-    assert isclose(P1['the'], 0.0611, rel_tol=0.001)
-
-    assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01)
-
-    assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001)
-    assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001)
-    assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001)
-
-    assert P2.cond_prob.get(('went',)) is None
-
-    assert P3.cond_prob['in', 'order'].dictionary == {'to': 6}
-
-
 def test_ir_system():
     from collections import namedtuple
     Results = namedtuple('IRResults', ['score', 'url'])

From ebf6fb537b9ebe16dd6d322abf018ee512051677 Mon Sep 17 00:00:00 2001
From: Antonis Maronikolakis <antmarakis@programmers.gr>
Date: Sat, 11 Mar 2017 13:39:44 +0200
Subject: [PATCH 2/2] "test_ngram_models" to "test_text_models"

---
 tests/test_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_text.py b/tests/test_text.py
index 2391820e3..d58cd497a 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -6,7 +6,7 @@
 from utils import isclose, DataFile
 
 
-def test_ngram_models():
+def test_text_models():
     flatland = DataFile("EN-text/flatland.txt").read()
     wordseq = words(flatland)
     P1 = UnigramTextModel(wordseq)