From ed7c3d6fce3a827d8c78103a89e81e4eee1ef1b8 Mon Sep 17 00:00:00 2001
From: Anthony Marakis
Date: Thu, 3 Aug 2017 00:05:35 +0300
Subject: [PATCH 1/2] Update nlp.py

---
 nlp.py | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 60 insertions(+), 8 deletions(-)

diff --git a/nlp.py b/nlp.py
index e9eff8e01..51007a985 100644
--- a/nlp.py
+++ b/nlp.py
@@ -116,6 +116,7 @@ def __init__(self, name, rules, lexicon):
         self.rules = rules
         self.lexicon = lexicon
         self.categories = defaultdict(list)
+
         for lhs in lexicon:
             for word, prob in lexicon[lhs]:
                 self.categories[word].append((lhs, prob))
@@ -128,6 +129,16 @@ def isa(self, word, cat):
         """Return True iff word is of category cat"""
         return cat in [c for c, _ in self.categories[word]]
 
+    def cnf_rules(self):
+        """Return a list of tuples (X, Y, Z, p) for rules
+        of the form X -> Y Z [p]."""
+        cnf = []
+        for X, rules in self.rules.items():
+            for (Y, Z), p in rules:
+                cnf.append((X, Y, Z, p))
+
+        return cnf
+
     def generate_random(self, S='S'):
         """Replace each token in S by a random entry in grammar (recursively).
         Returns a tuple of (sentence, probability)."""
@@ -189,11 +200,48 @@ def __repr__(self):
                     V='saw | liked | feel'
                     ))
 
-E_NP_ = Grammar('E_NP_', # another trivial grammar for testing
+E_NP_ = Grammar('E_NP_', # Another trivial grammar for testing
                 Rules(NP='Adj NP | N'),
                 Lexicon(Adj='happy | handsome | hairy',
                         N='man'))
 
+E_Prob = ProbGrammar('E_Prob', # The probabilistic grammar from the notebook
+                     ProbRules(
+                        S="NP VP [0.6] | S Conjunction S [0.4]",
+                        NP="Pronoun [0.2] | Name [0.05] | Noun [0.2] | Article Noun [0.15] \
+                            | Article Adjs Noun [0.1] | Digit [0.05] | NP PP [0.15] | NP RelClause [0.1]",
+                        VP="Verb [0.3] | VP NP [0.2] | VP Adjective [0.25] | VP PP [0.15] | VP Adverb [0.1]",
+                        Adjs="Adjective [0.5] | Adjective Adjs [0.5]",
+                        PP="Preposition NP [1]",
+                        RelClause="RelPro VP [1]"
+                     ),
+                     ProbLexicon(
+                        Verb="is [0.5] | say [0.3] | are [0.2]",
+                        Noun="robot [0.4] | sheep [0.4] | fence [0.2]",
+                        Adjective="good [0.5] | new [0.2] | sad [0.3]",
+                        Adverb="here [0.6] | lightly [0.1] | now [0.3]",
+                        Pronoun="me [0.3] | you [0.4] | he [0.3]",
+                        RelPro="that [0.5] | who [0.3] | which [0.2]",
+                        Name="john [0.4] | mary [0.4] | peter [0.2]",
+                        Article="the [0.5] | a [0.25] | an [0.25]",
+                        Preposition="to [0.4] | in [0.3] | at [0.3]",
+                        Conjunction="and [0.5] | or [0.2] | but [0.3]",
+                        Digit="0 [0.35] | 1 [0.35] | 2 [0.3]"
+                     ))
+
+E_Prob_Chomsky = ProbGrammar('E_Prob_Chomsky', # A probabilistic grammar in CNF
+                             ProbRules(
+                                S='NP VP [1]',
+                                NP='Article Noun [0.6] | Adjective Noun [0.4]',
+                                VP='Verb NP [0.5] | Verb Adjective [0.5]',
+                             ),
+                             ProbLexicon(
+                                Article='the [0.5] | a [0.25] | an [0.25]',
+                                Noun='robot [0.4] | sheep [0.4] | fence [0.2]',
+                                Adjective='good [0.5] | new [0.2] | sad [0.3]',
+                                Verb='is [0.5] | say [0.3] | are [0.2]'
+                             ))
+
 # ______________________________________________________________________________
 # Chart Parsing
 
@@ -236,7 +284,7 @@ def parse(self, words, S='S'):
         return self.chart
 
     def add_edge(self, edge):
-        "Add edge to chart, and see if it extends or predicts another edge."
+        """Add edge to chart, and see if it extends or predicts another edge."""
         start, end, lhs, found, expects = edge
         if edge not in self.chart[end]:
             self.chart[end].append(edge)
@@ -248,13 +296,13 @@ def add_edge(self, edge):
             self.predictor(edge)
 
     def scanner(self, j, word):
-        "For each edge expecting a word of this category here, extend the edge."
+ """For each edge expecting a word of this category here, extend the edge.""" for (i, j, A, alpha, Bb) in self.chart[j]: if Bb and self.grammar.isa(word, Bb[0]): self.add_edge([i, j+1, A, alpha + [(Bb[0], word)], Bb[1:]]) def predictor(self, edge): - "Add to chart any rules for B that could help extend this edge." + """Add to chart any rules for B that could help extend this edge.""" (i, j, A, alpha, Bb) = edge B = Bb[0] if B in self.grammar.rules: @@ -262,7 +310,7 @@ def predictor(self, edge): self.add_edge([j, j, B, [], rhs]) def extender(self, edge): - "See what edges can be extended by this edge." + """See what edges can be extended by this edge.""" (j, k, B, _, _) = edge for (i, j, A, alpha, B1b) in self.chart[j]: if B1b and B == B1b[0]: @@ -273,23 +321,26 @@ def extender(self, edge): # CYK Parsing def CYK_parse(words, grammar): - "[Figure 23.5]" + """ [Figure 23.5] """ # We use 0-based indexing instead of the book's 1-based. N = len(words) P = defaultdict(float) + # Insert lexical rules for each word. for (i, word) in enumerate(words): - for (X, p) in grammar.categories[word]: # XXX grammar.categories needs changing, above + for (X, p) in grammar.categories[word]: P[X, i, 1] = p + # Combine first and second parts of right-hand sides of rules, # from short to long. for length in range(2, N+1): for start in range(N-length+1): for len1 in range(1, length): # N.B. the book incorrectly has N instead of length len2 = length - len1 - for (X, Y, Z, p) in grammar.cnf_rules(): # XXX grammar needs this method + for (X, Y, Z, p) in grammar.cnf_rules(): P[X, start, length] = max(P[X, start, length], P[Y, start, len1] * P[Z, start+len1, len2] * p) + return P @@ -395,6 +446,7 @@ def relevant_pages(query): hit_intersection = hit_intersection.intersection(hit_list) return {addr: pagesIndex[addr] for addr in hit_intersection} + def normalize(pages): """Normalize divides each page's score by the sum of the squares of all pages' scores (separately for both the authority and hub scores). From 7953abe10aa011ef76f77dae26c36f0e837765f7 Mon Sep 17 00:00:00 2001 From: Anthony Marakis Date: Thu, 3 Aug 2017 00:06:20 +0300 Subject: [PATCH 2/2] add CYK parsing test --- tests/test_nlp.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_nlp.py b/tests/test_nlp.py index e5ccb1e63..030469f46 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -5,6 +5,7 @@ from nlp import expand_pages, relevant_pages, normalize, ConvergenceDetector, getInlinks from nlp import getOutlinks, Page, determineInlinks, HITS from nlp import Rules, Lexicon, Grammar, ProbRules, ProbLexicon, ProbGrammar +from nlp import CYK_parse # Clumsy imports because we want to access certain nlp.py globals explicitly, because # they are accessed by functions within nlp.py @@ -92,6 +93,13 @@ def test_prob_generation(): assert len(sentence) == 2 +def test_CYK_parse(): + grammar = nlp.E_Prob_Chomsky + words = ['the', 'robot', 'is', 'good'] + P = CYK_parse(words, grammar) + assert len(P) == 52 + + # ______________________________________________________________________________ # Data Setup