 from utils import *
 from math import log, exp
-import re, probability, string, search
+import heapq, re, search
 
-class CountingProbDist(probability.ProbDist):
+class CountingProbDist:
     """A probability distribution formed by observing and counting examples.
-    If P is an instance of this class and o
-    is an observed value, then there are 3 main operations:
+    If p is an instance of this class and o is an observed value, then
+    there are 3 main operations:
     p.add(o) increments the count for observation o by 1.
     p.sample() returns a random element from the distribution.
     p[o] returns the probability for o (as in a regular ProbDist)."""
@@ -23,49 +23,40 @@ def __init__(self, observations=[], default=0):
         """Create a distribution, and optionally add in some observations.
         By default this is an unsmoothed distribution, but saying default=1,
         for example, gives you add-one smoothing."""
-        update(self, dictionary=DefaultDict(default), needs_recompute=False,
-               table=[], n_obs=0)
+        update(self, dictionary={}, n_obs=0.0, default=default, sampler=None)
         for o in observations:
             self.add(o)
 
     def add(self, o):
-        """Add an observation o to the distribution."""
+        "Add an observation o to the distribution."
+        self.smooth_for(o)
         self.dictionary[o] += 1
         self.n_obs += 1
-        self.needs_recompute = True
+        self.sampler = None
 
-    def sample(self):
-        """Return a random sample from the distribution."""
-        if self.needs_recompute: self._recompute()
-        if self.n_obs == 0:
-            return None
-        i = bisect.bisect_left(self.table, (1 + random.randrange(self.n_obs),))
-        (count, o) = self.table[i]
-        return o
+    def smooth_for(self, o):
+        """Include o among the possible observations, whether or not
+        it's been observed yet."""
+        if o not in self.dictionary:
+            self.dictionary[o] = self.default
+            self.n_obs += self.default
+            self.sampler = None
 
     def __getitem__(self, item):
-        """Return an estimate of the probability of item."""
-        if self.needs_recompute: self._recompute()
+        "Return an estimate of the probability of item."
+        self.smooth_for(item)
         return self.dictionary[item] / self.n_obs
 
-    def __len__(self):
-        if self.needs_recompute: self._recompute()
-        return self.n_obs
-
     def top(self, n):
         "Return (count, obs) tuples for the n most frequent observations."
-        items = [(v, k) for (k, v) in self.dictionary.items()]
-        items.sort(); items.reverse()
-        return items[0:n]
-
-    def _recompute(self):
-        """Recompute the total count n_obs and the table of entries."""
-        n_obs = 0
-        table = []
-        for (o, count) in self.dictionary.items():
-            n_obs += count
-            table.append((n_obs, o))
-        update(self, n_obs=float(n_obs), table=table, needs_recompute=False)
+        return heapq.nlargest(n, [(v, k) for (k, v) in self.dictionary.items()])
+
+    def sample(self):
+        "Return a random sample from the distribution."
+        if self.sampler is None:
+            self.sampler = weighted_sampler(self.dictionary.keys(),
+                                            self.dictionary.values())
+        return self.sampler()
 
 #______________________________________________________________________________
 
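Review note: here is a minimal usage sketch of the rewritten class, assuming this is aima-python's text.py (with utils providing update and weighted_sampler) importable on the path; the toy corpus and the expected values in the comments are illustrative, not from the source.

    from text import CountingProbDist  # assumption: aima-python's text.py is importable

    p = CountingProbDist('spam spam eggs spam ham'.split(), default=1)
    print(p['spam'])    # 0.5: (3 observations + 1 default count) / 8 total counts
    print(p['toast'])   # unseen word: smooth_for() first gives it the default count
    print(p.top(2))     # [(4, 'spam'), (2, 'ham')]: the most frequent (count, word) pairs
    print(p.sample())   # one word drawn in proportion to the smoothed counts

One behavioral consequence worth flagging: __getitem__ calls smooth_for, so merely querying the probability of an unseen item permanently adds its default count to the distribution.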
@@ -81,7 +72,7 @@ def samples(self, n):
 class NgramTextModel(CountingProbDist):
     """This is a discrete probability distribution over n-tuples of words.
     You can add, sample or get P[(word1, ..., wordn)]. The method P.samples(n)
-    builds up an n-word sequence; P.add_text and P.add_sequence add data."""
+    builds up an n-word sequence; P.add and P.add_sequence add data."""
 
     def __init__(self, n, observation_sequence=[]):
         ## In addition to the dictionary of n-tuples, cond_prob is a
@@ -91,7 +82,7 @@ def __init__(self, n, observation_sequence=[]):
         self.cond_prob = DefaultDict(CountingProbDist())
         self.add_sequence(observation_sequence)
 
-    ## sample, __len__, __getitem__ inherited from CountingProbDist
+    ## sample, __getitem__ inherited from CountingProbDist
     ## Note they deal with tuples, not strings, as inputs
 
     def add(self, ngram):
@@ -113,13 +104,12 @@ def samples(self, nwords):
         n = self.n
         nminus1gram = ('',) * (n - 1)
         output = []
-        while len(output) < nwords:
+        for i in range(nwords):
+            if nminus1gram not in self.cond_prob:
+                nminus1gram = ('',) * (n - 1)  # Cannot continue, so restart.
             wn = self.cond_prob[nminus1gram].sample()
-            if wn:
-                output.append(wn)
-                nminus1gram = nminus1gram[1:] + (wn,)
-            else: ## Cannot continue, so restart.
-                nminus1gram = ('',) * (n - 1)
+            output.append(wn)
+            nminus1gram = nminus1gram[1:] + (wn,)
         return ' '.join(output)
 
 #______________________________________________________________________________
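A note on the samples() rewrite: the loop now always appends a sampled word, resetting the context to start-of-text padding only when the current (n-1)-gram has no recorded continuation. The sliding window itself is easy to see in isolation; this standalone sketch (with a made-up word list standing in for sampled words) shows how the context advances for n = 3:

    # Sliding context window used by samples(), shown in isolation for n = 3.
    n = 3
    context = ('',) * (n - 1)          # start-of-text padding: ('', '')
    for wn in ['the', 'cat', 'sat']:   # stand-ins for words drawn by sample()
        context = context[1:] + (wn,)  # drop the oldest word, append the newest
    print(context)                     # ('cat', 'sat')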
@@ -404,24 +394,14 @@ def goal_test(self, state):
 True
 """
 
-__doc__ += random_tests("""
+__doc__ += ("""
 ## Compare 1-, 2-, and 3-gram word models of the same text.
 >>> flatland = DataFile("EN-text/flatland.txt").read()
 >>> wordseq = words(flatland)
 >>> P1 = UnigramTextModel(wordseq)
 >>> P2 = NgramTextModel(2, wordseq)
 >>> P3 = NgramTextModel(3, wordseq)
 
-## Generate random text from the N-gram models
->>> P1.samples(20)
-'you thought known but were insides of see in depend by us dodecahedrons just but i words are instead degrees'
-
->>> P2.samples(20)
-'flatland well then can anything else more into the total destruction and circles teach others confine women must be added'
-
->>> P3.samples(20)
-'flatland by edwin a abbott 1884 to the wake of a certificate from nature herself proving the equal sided triangle'
-
 ## The most frequent entries in each model
 >>> P1.top(10)
 [(2081, 'the'), (1479, 'of'), (1021, 'and'), (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), (478, 'that'), (399, 'is'), (348, 'you')]
@@ -431,6 +411,18 @@ def goal_test(self, state):
 
 >>> P3.top(10)
 [(30, ('a', 'straight', 'line')), (19, ('of', 'three', 'dimensions')), (16, ('the', 'sense', 'of')), (13, ('by', 'the', 'sense')), (13, ('as', 'well', 'as')), (12, ('of', 'the', 'circles')), (12, ('of', 'sight', 'recognition')), (11, ('the', 'number', 'of')), (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))]
+""")
+
+__doc__ += random_tests("""
+## Generate random text from the N-gram models
+>>> P1.samples(20)
+'you thought known but were insides of see in depend by us dodecahedrons just but i words are instead degrees'
+
+>>> P2.samples(20)
+'flatland well then can anything else more into the total destruction and circles teach others confine women must be added'
+
+>>> P3.samples(20)
+'flatland by edwin a abbott 1884 to the wake of a certificate from nature herself proving the equal sided triangle'
 
 ## Probabilities of some common n-grams
 >>> P1['the']