Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 03e6861

Browse files
committed
Move CountingProbDist to learning.py and use it in NaiveBayesLearner. Make NaiveBayesLearner use target-value frequencies too.
1 parent 8e39013 commit 03e6861

File tree

2 files changed

+71
-87
lines changed

2 files changed

+71
-87
lines changed

learning.py

Lines changed: 68 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,57 @@ def parse_csv(input, delim=','):
142142

143143
#______________________________________________________________________________
144144

145+
class CountingProbDist:
146+
"""A probability distribution formed by observing and counting examples.
147+
If p is an instance of this class and o is an observed value, then
148+
there are 3 main operations:
149+
p.add(o) increments the count for observation o by 1.
150+
p.sample() returns a random element from the distribution.
151+
p[o] returns the probability for o (as in a regular ProbDist)."""
152+
153+
def __init__(self, observations=[], default=0):
154+
"""Create a distribution, and optionally add in some observations.
155+
By default this is an unsmoothed distribution, but saying default=1,
156+
for example, gives you add-one smoothing."""
157+
update(self, dictionary={}, n_obs=0.0, default=default, sampler=None)
158+
for o in observations:
159+
self.add(o)
160+
161+
def add(self, o):
162+
"Add an observation o to the distribution."
163+
self.smooth_for(o)
164+
self.dictionary[o] += 1
165+
self.n_obs += 1
166+
self.sampler = None
167+
168+
def smooth_for(self, o):
169+
"""Include o among the possible observations, whether or not
170+
it's been observed yet."""
171+
if o not in self.dictionary:
172+
self.dictionary[o] = self.default
173+
self.n_obs += self.default
174+
self.sampler = None
175+
176+
def __getitem__(self, item):
177+
"Return an estimate of the probability of item."
178+
self.smooth_for(item)
179+
return self.dictionary[item] / self.n_obs
180+
181+
# (top() and sample() are not used in this module, but elsewhere.)
182+
183+
def top(self, n):
184+
"Return (count, obs) tuples for the n most frequent observations."
185+
return heapq.nlargest(n, [(v, k) for (k, v) in self.dictionary.items()])
186+
187+
def sample(self):
188+
"Return a random sample from the distribution."
189+
if self.sampler is None:
190+
self.sampler = weighted_sampler(self.dictionary.keys(),
191+
self.dictionary.values())
192+
return self.sampler()
193+
194+
#______________________________________________________________________________
195+
145196
def PluralityLearner(dataset):
146197
"""A very dumb algorithm: always pick the result that was most popular
147198
in the training data. Makes a baseline for comparison."""
@@ -154,48 +205,29 @@ def predict(example):
154205
#______________________________________________________________________________
155206

156207
def NaiveBayesLearner(dataset):
157-
"""Just count the target/attr/val occurrences.
158-
Count how many times each value of each input attribute occurs.
159-
Store count in _N[targetvalue][attr][val]. Let
160-
_N[targetvalue][attr][None] be the sum over all vals."""
161-
162-
_N = {}
163-
## Initialize to 0
164-
for gv in dataset.values[dataset.target]:
165-
_N[gv] = {}
166-
for attr in dataset.inputs:
167-
_N[gv][attr] = {}
168-
assert None not in dataset.values[attr]
169-
for val in dataset.values[attr]:
170-
_N[gv][attr][val] = 0
171-
_N[gv][attr][None] = 0
172-
## Go thru examples
208+
"""Just count how many times each value of each input attribute
209+
occurs, conditional on the target value. Count the different
210+
target values too."""
211+
212+
targetvals = dataset.values[dataset.target]
213+
target_dist = CountingProbDist(targetvals)
214+
attr_dists = dict(((gv, attr), CountingProbDist(dataset.values[attr]))
215+
for gv in targetvals
216+
for attr in dataset.inputs)
173217
for example in dataset.examples:
174-
Ngv = _N[example[dataset.target]]
218+
targetval = example[dataset.target]
219+
target_dist.add(targetval)
175220
for attr in dataset.inputs:
176-
Ngv[attr][example[attr]] += 1
177-
Ngv[attr][None] += 1
221+
attr_dists[targetval, attr].add(example[attr])
178222

179223
def predict(example):
180224
"""Predict the target value for example. Consider each possible value,
181-
choose the most likely, by looking at each attribute independently."""
182-
possible_values = dataset.values[dataset.target]
225+
and pick the most likely by looking at each attribute independently."""
183226
def class_probability(targetval):
184-
return product(P(targetval, a, example[a]) for a in dataset.inputs)
185-
return argmax(possible_values, class_probability)
186-
187-
def P(targetval, attr, attrval):
188-
"""Smooth the raw counts to give a probability estimate.
189-
Estimate adds 1 to numerator and len(possible vals) to denominator."""
190-
return ((N(targetval, attr, attrval) + 1.0) /
191-
(N(targetval, attr, None) + len(dataset.values[attr])))
192-
193-
def N(targetval, attr, attrval):
194-
"Return the count in the training data of this combination."
195-
try:
196-
return _N[targetval][attr][attrval]
197-
except KeyError:
198-
return 0
227+
return (target_dist[targetval]
228+
* product(attr_dists[targetval, attr][example[attr]]
229+
for attr in dataset.inputs))
230+
return argmax(targetvals, class_probability)
199231

200232
return predict
201233

text.py

Lines changed: 3 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -5,57 +5,9 @@
55
working on a tiny sample of Unix manual pages."""
66

77
from utils import *
8+
from learning import CountingProbDist
89
from math import log, exp
9-
import heapq, re, search
10-
11-
class CountingProbDist:
12-
"""A probability distribution formed by observing and counting examples.
13-
If p is an instance of this class and o is an observed value, then
14-
there are 3 main operations:
15-
p.add(o) increments the count for observation o by 1.
16-
p.sample() returns a random element from the distribution.
17-
p[o] returns the probability for o (as in a regular ProbDist)."""
18-
19-
def __init__(self, observations=[], default=0):
20-
"""Create a distribution, and optionally add in some observations.
21-
By default this is an unsmoothed distribution, but saying default=1,
22-
for example, gives you add-one smoothing."""
23-
update(self, dictionary={}, n_obs=0.0, default=default, sampler=None)
24-
for o in observations:
25-
self.add(o)
26-
27-
def add(self, o):
28-
"Add an observation o to the distribution."
29-
self.smooth_for(o)
30-
self.dictionary[o] += 1
31-
self.n_obs += 1
32-
self.sampler = None
33-
34-
def smooth_for(self, o):
35-
"""Include o among the possible observations, whether or not
36-
it's been observed yet."""
37-
if o not in self.dictionary:
38-
self.dictionary[o] = self.default
39-
self.n_obs += self.default
40-
self.sampler = None
41-
42-
def __getitem__(self, item):
43-
"Return an estimate of the probability of item."
44-
self.smooth_for(item)
45-
return self.dictionary[item] / self.n_obs
46-
47-
def top(self, n):
48-
"Return (count, obs) tuples for the n most frequent observations."
49-
return heapq.nlargest(n, [(v, k) for (k, v) in self.dictionary.items()])
50-
51-
def sample(self):
52-
"Return a random sample from the distribution."
53-
if self.sampler is None:
54-
self.sampler = weighted_sampler(self.dictionary.keys(),
55-
self.dictionary.values())
56-
return self.sampler()
57-
58-
#______________________________________________________________________________
10+
import re, search
5911

6012
class UnigramTextModel(CountingProbDist):
6113
"""This is a discrete probability distribution over words, so you
@@ -79,7 +31,7 @@ def __init__(self, n, observation_sequence=[]):
7931
self.cond_prob = DefaultDict(CountingProbDist())
8032
self.add_sequence(observation_sequence)
8133

82-
## sample, __getitem__ inherited from CountingProbDist
34+
## __getitem__, top, sample inherited from CountingProbDist
8335
## Note they deal with tuples, not strings, as inputs
8436

8537
def add(self, ngram):

0 commit comments

Comments
 (0)