@@ -142,6 +142,57 @@ def parse_csv(input, delim=','):

#______________________________________________________________________________

+class CountingProbDist:
+    """A probability distribution formed by observing and counting examples.
+    If p is an instance of this class and o is an observed value, then
+    there are 3 main operations:
+    p.add(o) increments the count for observation o by 1.
+    p.sample() returns a random element from the distribution.
+    p[o] returns the probability for o (as in a regular ProbDist)."""
+
+    def __init__(self, observations=[], default=0):
+        """Create a distribution, and optionally add in some observations.
+        By default this is an unsmoothed distribution, but saying default=1,
+        for example, gives you add-one smoothing."""
+        update(self, dictionary={}, n_obs=0.0, default=default, sampler=None)
+        for o in observations:
+            self.add(o)
+
+    def add(self, o):
+        "Add an observation o to the distribution."
+        self.smooth_for(o)
+        self.dictionary[o] += 1
+        self.n_obs += 1
+        self.sampler = None
+
+    def smooth_for(self, o):
+        """Include o among the possible observations, whether or not
+        it's been observed yet."""
+        if o not in self.dictionary:
+            self.dictionary[o] = self.default
+            self.n_obs += self.default
+            self.sampler = None
+
+    def __getitem__(self, item):
+        "Return an estimate of the probability of item."
+        self.smooth_for(item)
+        return self.dictionary[item] / self.n_obs
+
+    # (top() and sample() are not used in this module, but elsewhere.)
+
+    def top(self, n):
+        "Return (count, obs) tuples for the n most frequent observations."
+        return heapq.nlargest(n, [(v, k) for (k, v) in self.dictionary.items()])
+
+    def sample(self):
+        "Return a random sample from the distribution."
+        if self.sampler is None:
+            self.sampler = weighted_sampler(self.dictionary.keys(),
+                                            self.dictionary.values())
+        return self.sampler()
+
+#______________________________________________________________________________
+
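# --- Usage sketch for CountingProbDist (illustrative only, not part of this
# commit). The expected values follow from the counting logic above, given
# that n_obs starts as the float 0.0:
#
#     p = CountingProbDist('aab')            # observe 'a', 'a', 'b'
#     p['a']                                 # -> 2/3.0, about 0.667
#
#     q = CountingProbDist('ab', default=1)  # add-one smoothing
#     q['c']                                 # unseen, but smooth_for() gives it
#                                            # a pseudo-count: 1/5.0 = 0.2
#     q.sample()                             # 'a' or 'b', weighted by counts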
def PluralityLearner(dataset):
    """A very dumb algorithm: always pick the result that was most popular
    in the training data. Makes a baseline for comparison."""
@@ -154,48 +205,29 @@ def predict(example):
#______________________________________________________________________________

def NaiveBayesLearner(dataset):
-    """Just count the target/attr/val occurrences.
-    Count how many times each value of each input attribute occurs.
-    Store count in _N[targetvalue][attr][val]. Let
-    _N[targetvalue][attr][None] be the sum over all vals."""
-
-    _N = {}
-    ## Initialize to 0
-    for gv in dataset.values[dataset.target]:
-        _N[gv] = {}
-        for attr in dataset.inputs:
-            _N[gv][attr] = {}
-            assert None not in dataset.values[attr]
-            for val in dataset.values[attr]:
-                _N[gv][attr][val] = 0
-            _N[gv][attr][None] = 0
-    ## Go thru examples
+    """Just count how many times each value of each input attribute
+    occurs, conditional on the target value. Count the different
+    target values too."""
+
+    targetvals = dataset.values[dataset.target]
+    target_dist = CountingProbDist(targetvals)
+    attr_dists = dict(((gv, attr), CountingProbDist(dataset.values[attr]))
+                      for gv in targetvals
+                      for attr in dataset.inputs)
    for example in dataset.examples:
-        Ngv = _N[example[dataset.target]]
+        targetval = example[dataset.target]
+        target_dist.add(targetval)
        for attr in dataset.inputs:
-            Ngv[attr][example[attr]] += 1
-            Ngv[attr][None] += 1
+            attr_dists[targetval, attr].add(example[attr])

    def predict(example):
        """Predict the target value for example. Consider each possible value,
-        choose the most likely, by looking at each attribute independently."""
-        possible_values = dataset.values[dataset.target]
+        and pick the most likely by looking at each attribute independently."""
        def class_probability(targetval):
-            return product(P(targetval, a, example[a]) for a in dataset.inputs)
-        return argmax(possible_values, class_probability)
-
-    def P(targetval, attr, attrval):
-        """Smooth the raw counts to give a probability estimate.
-        Estimate adds 1 to numerator and len(possible vals) to denominator."""
-        return ((N(targetval, attr, attrval) + 1.0) /
-                (N(targetval, attr, None) + len(dataset.values[attr])))
-
-    def N(targetval, attr, attrval):
-        "Return the count in the training data of this combination."
-        try:
-            return _N[targetval][attr][attrval]
-        except KeyError:
-            return 0
+            return (target_dist[targetval]
+                    * product(attr_dists[targetval, attr][example[attr]]
+                              for attr in dataset.inputs))
+        return argmax(targetvals, class_probability)

    return predict
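# --- Usage sketch for NaiveBayesLearner (illustrative only, not part of this
# commit). predict() scores each class c as target_dist[c] * product of
# attr_dists[c, attr][example[attr]], i.e. P(c) * prod_a P(v_a | c), and
# returns the argmax. The dataset name and example values are assumptions:
#
#     iris = DataSet(name='iris')        # target defaults to the last attribute
#     predict = NaiveBayesLearner(iris)
#     predict([5.0, 3.1, 0.9, 0.1])      # -> one of dataset.values[dataset.target],
#                                        #    e.g. 'setosa'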