@@ -116,6 +116,7 @@ def __init__(self, name, rules, lexicon):
        self.rules = rules
        self.lexicon = lexicon
        self.categories = defaultdict(list)
+
        for lhs in lexicon:
            for word, prob in lexicon[lhs]:
                self.categories[word].append((lhs, prob))
@@ -128,6 +129,16 @@ def isa(self, word, cat):
        """Return True iff word is of category cat"""
        return cat in [c for c, _ in self.categories[word]]

+    def cnf_rules(self):
+        """Return a list of tuples (X, Y, Z, p) for rules of the form
+        X -> Y Z [p]"""
+        cnf = []
+        for X, rules in self.rules.items():
+            for (Y, Z), p in rules:
+                cnf.append((X, Y, Z, p))
+
+        return cnf
+
    def generate_random(self, S='S'):
        """Replace each token in S by a random entry in grammar (recursively).
        Returns a tuple of (sentence, probability)."""
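
For illustration, a minimal usage sketch of the new cnf_rules method, using the E_Prob_Chomsky grammar defined later in this commit (the import path is an assumption, not part of the change):

    from nlp import E_Prob_Chomsky  # assumed import path for this module

    # Enumerate the grammar's rules as (X, Y, Z, p) tuples, one per X -> Y Z [p] rule.
    for X, Y, Z, p in E_Prob_Chomsky.cnf_rules():
        print(X, '->', Y, Z, '[{}]'.format(p))
    # prints lines such as: S -> NP VP [1.0]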
@@ -189,11 +200,48 @@ def __repr__(self):
            V = 'saw | liked | feel'
            ))

-E_NP_ = Grammar('E_NP_',  # another trivial grammar for testing
+E_NP_ = Grammar('E_NP_',  # Another Trivial Grammar for testing
                Rules(NP='Adj NP | N'),
                Lexicon(Adj='happy | handsome | hairy',
                        N='man'))

+E_Prob = ProbGrammar('E_Prob',  # The Probabilistic Grammar from the notebook
+                     ProbRules(
+                         S="NP VP [0.6] | S Conjunction S [0.4]",
+                         NP="Pronoun [0.2] | Name [0.05] | Noun [0.2] | Article Noun [0.15] \
+                             | Article Adjs Noun [0.1] | Digit [0.05] | NP PP [0.15] | NP RelClause [0.1]",
+                         VP="Verb [0.3] | VP NP [0.2] | VP Adjective [0.25] | VP PP [0.15] | VP Adverb [0.1]",
+                         Adjs="Adjective [0.5] | Adjective Adjs [0.5]",
+                         PP="Preposition NP [1]",
+                         RelClause="RelPro VP [1]"
+                     ),
+                     ProbLexicon(
+                         Verb="is [0.5] | say [0.3] | are [0.2]",
+                         Noun="robot [0.4] | sheep [0.4] | fence [0.2]",
+                         Adjective="good [0.5] | new [0.2] | sad [0.3]",
+                         Adverb="here [0.6] | lightly [0.1] | now [0.3]",
+                         Pronoun="me [0.3] | you [0.4] | he [0.3]",
+                         RelPro="that [0.5] | who [0.3] | which [0.2]",
+                         Name="john [0.4] | mary [0.4] | peter [0.2]",
+                         Article="the [0.5] | a [0.25] | an [0.25]",
+                         Preposition="to [0.4] | in [0.3] | at [0.3]",
+                         Conjunction="and [0.5] | or [0.2] | but [0.3]",
+                         Digit="0 [0.35] | 1 [0.35] | 2 [0.3]"
+                     ))
+
+E_Prob_Chomsky = ProbGrammar('E_Prob_Chomsky',  # A Probabilistic Grammar in CNF
+                             ProbRules(
+                                 S='NP VP [1]',
+                                 NP='Article Noun [0.6] | Adjective Noun [0.4]',
+                                 VP='Verb NP [0.5] | Verb Adjective [0.5]'
+                             ),
+                             ProbLexicon(
+                                 Article='the [0.5] | a [0.25] | an [0.25]',
+                                 Noun='robot [0.4] | sheep [0.4] | fence [0.2]',
+                                 Adjective='good [0.5] | new [0.2] | sad [0.3]',
+                                 Verb='is [0.5] | say [0.3] | are [0.2]'
+                             ))
+

# ______________________________________________________________________________
# Chart Parsing
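
As a quick sanity check of the new grammar, a short sketch using the existing generate_random method shown above (its output is random, so the example result in the comment is illustrative only):

    # Sample a random derivation from E_Prob together with its probability.
    sentence, probability = E_Prob.generate_random('S')
    print(sentence, probability)  # e.g. a short sentence such as 'you are good' and a small probability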
@@ -236,7 +284,7 @@ def parse(self, words, S='S'):
        return self.chart

    def add_edge(self, edge):
-        "Add edge to chart, and see if it extends or predicts another edge."
+        """Add edge to chart, and see if it extends or predicts another edge."""
        start, end, lhs, found, expects = edge
        if edge not in self.chart[end]:
            self.chart[end].append(edge)
@@ -248,21 +296,21 @@ def add_edge(self, edge):
            self.predictor(edge)

    def scanner(self, j, word):
-        "For each edge expecting a word of this category here, extend the edge."
+        """For each edge expecting a word of this category here, extend the edge."""
        for (i, j, A, alpha, Bb) in self.chart[j]:
            if Bb and self.grammar.isa(word, Bb[0]):
                self.add_edge([i, j + 1, A, alpha + [(Bb[0], word)], Bb[1:]])

    def predictor(self, edge):
-        "Add to chart any rules for B that could help extend this edge."
+        """Add to chart any rules for B that could help extend this edge."""
        (i, j, A, alpha, Bb) = edge
        B = Bb[0]
        if B in self.grammar.rules:
            for rhs in self.grammar.rewrites_for(B):
                self.add_edge([j, j, B, [], rhs])

    def extender(self, edge):
-        "See what edges can be extended by this edge."
+        """See what edges can be extended by this edge."""
        (j, k, B, _, _) = edge
        for (i, j, A, alpha, B1b) in self.chart[j]:
            if B1b and B == B1b[0]:
@@ -273,23 +321,26 @@ def extender(self, edge):
# CYK Parsing

def CYK_parse(words, grammar):
-    "[Figure 23.5]"
+    """[Figure 23.5]"""
    # We use 0-based indexing instead of the book's 1-based.
    N = len(words)
    P = defaultdict(float)
+
    # Insert lexical rules for each word.
    for (i, word) in enumerate(words):
-        for (X, p) in grammar.categories[word]:  # XXX grammar.categories needs changing, above
+        for (X, p) in grammar.categories[word]:
            P[X, i, 1] = p
+
    # Combine first and second parts of right-hand sides of rules,
    # from short to long.
    for length in range(2, N + 1):
        for start in range(N - length + 1):
            for len1 in range(1, length):  # N.B. the book incorrectly has N instead of length
                len2 = length - len1
-                for (X, Y, Z, p) in grammar.cnf_rules():  # XXX grammar needs this method
+                for (X, Y, Z, p) in grammar.cnf_rules():
                    P[X, start, length] = max(P[X, start, length],
                                              P[Y, start, len1] * P[Z, start + len1, len2] * p)
+
    return P

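
A usage sketch for the updated CYK_parse with the CNF grammar added above (the sentence is an arbitrary choice; the exact probabilities depend on the grammar's rule and lexicon weights):

    # Parse a four-word sentence bottom-up with CYK.
    words = 'the robot is good'.split()
    P = CYK_parse(words, E_Prob_Chomsky)
    # P maps (symbol, start, length) to the probability of the best subtree;
    # P['S', 0, 4] is the probability of the most likely parse of the whole sentence.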
@@ -395,6 +446,7 @@ def relevant_pages(query):
        hit_intersection = hit_intersection.intersection(hit_list)
    return {addr: pagesIndex[addr] for addr in hit_intersection}

+
def normalize(pages):
    """Normalize divides each page's score by the sum of the squares of all
    pages' scores (separately for both the authority and hub scores).