From 7899ec4f772f48e3037ee664e4dc0fb07fc49218 Mon Sep 17 00:00:00 2001 From: Lucas Moura Date: Fri, 24 Mar 2017 14:00:57 -0300 Subject: [PATCH 1/2] Fix off-by-one in NgramTextModel.add_sequence --- text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text.py b/text.py index 855e89aaf..e064b6049 100644 --- a/text.py +++ b/text.py @@ -55,7 +55,7 @@ def add_sequence(self, words): Prefix some copies of the empty word, '', to make the start work.""" n = self.n words = ['', ] * (n - 1) + words - for i in range(len(words) - n): + for i in range(len(words) - n + 1): self.add(tuple(words[i:i + n])) def samples(self, nwords): From 96dbe26c5125e8d35d9c3d4a8e2363f84925dfa0 Mon Sep 17 00:00:00 2001 From: Lucas Moura Date: Fri, 24 Mar 2017 14:01:12 -0300 Subject: [PATCH 2/2] Add new tests for NgramTextModel --- tests/test_text.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_text.py b/tests/test_text.py index 577ad661b..d884e02a2 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -47,6 +47,32 @@ def test_text_models(): assert P3.cond_prob['in', 'order'].dictionary == {'to': 6} + test_string = 'unigram' + wordseq = words(test_string) + + P1 = UnigramTextModel(wordseq) + + assert P1.dictionary == {('unigram'): 1} + + test_string = 'bigram text' + wordseq = words(test_string) + + P2 = NgramTextModel(2, wordseq) + + assert (P2.dictionary == {('', 'bigram'): 1, ('bigram', 'text'): 1} or + P2.dictionary == {('bigram', 'text'): 1, ('', 'bigram'): 1}) + + + test_string = 'test trigram text' + wordseq = words(test_string) + + P3 = NgramTextModel(3, wordseq) + + assert ('', '', 'test') in P3.dictionary + assert ('', 'test', 'trigram') in P3.dictionary + assert ('test', 'trigram', 'text') in P3.dictionary + assert len(P3.dictionary) == 3 + def test_viterbi_segmentation(): flatland = DataFile("EN-text/flatland.txt").read()