@@ -20,17 +20,17 @@ def find_frequent_itemsets(transactions, minimum_support):
20
20
"""
21
21
Finds frequent itemsets in the given transactions using FP-growth. This
22
22
function returns a generator instead of an eagerly-populated list of items.
23
-
23
+
24
24
The `transactions` parameter can be any iterable of iterables of items.
25
25
`minimum_support` should be an integer specifying the minimum number of
26
26
occurrences of an itemset for it to be accepted.
27
-
27
+
28
28
Each item must be hashable (i.e., it must be valid as a member of a
29
29
dictionary or a set).
30
30
"""
31
31
items = defaultdict (lambda : 0 ) # mapping from items to their supports
32
32
processed_transactions = []
33
-
33
+
34
34
# Load the passed-in transactions and count the support that individual
35
35
# items have.
36
36
for transaction in transactions :
@@ -39,71 +39,71 @@ def find_frequent_itemsets(transactions, minimum_support):
39
39
items [item ] += 1
40
40
processed .append (item )
41
41
processed_transactions .append (processed )
42
-
42
+
43
43
# Remove infrequent items from the item support dictionary.
44
44
items = dict (items )
45
45
for item , support in items .items ():
46
46
if support < minimum_support :
47
47
del items [item ]
48
-
48
+
49
49
# Build our FP-tree. Before any transactions can be added to the tree, they
50
50
# must be stripped of infrequent items and their surviving items must be
51
51
# sorted in decreasing order of frequency.
52
52
def clean_transaction(transaction):
    """
    Strip infrequent items from `transaction` and order the survivors.

    Only items that are still keys of the enclosing `items` support mapping
    are kept. The result is a new list sorted by decreasing support; items
    with equal support keep their original relative order.
    """
    # Build a real list first: on Python 3 `filter` returns a lazy iterator
    # with no `sort` method, so sorting its result in place would fail.
    frequent = [v for v in transaction if v in items]
    frequent.sort(key=lambda v: items[v], reverse=True)
    return frequent
56
-
56
+
57
57
master = FPTree ()
58
58
for transaction in imap (clean_transaction , processed_transactions ):
59
59
master .add (transaction )
60
-
60
+
61
61
def find_with_suffix(tree, suffix):
    """
    Yield every frequent itemset found in `tree` that ends with `suffix`.

    Each yielded itemset is a list: one item taken from the tree followed
    by the items of `suffix`.
    """
    for candidate, node_seq in tree.items():
        total = sum(node.count for node in node_seq)
        if total < minimum_support or candidate in suffix:
            continue

        # The candidate is frequent enough; report it together with the
        # suffix it extends.
        itemset = [candidate] + suffix
        yield itemset

        # Look for larger itemsets in the tree conditioned on this
        # candidate, relaying whatever the recursive search produces.
        subtree = conditional_tree_from_paths(tree.prefix_paths(candidate),
                                              minimum_support)
        for larger in find_with_suffix(subtree, itemset):
            yield larger
75
-
75
+
76
76
# Search for frequent itemsets, and yield the results we find.
77
77
for s in find_with_suffix (master , []):
78
78
yield s
79
79
80
80
class FPTree (object ):
81
81
"""
82
82
An FP tree.
83
-
83
+
84
84
This object may only store transaction items that are hashable (i.e., all
85
85
items must be valid as dictionary keys or set members).
86
86
"""
87
87
def __init__(self):
    """Create an empty tree that holds only a root node."""
    # Maps each item to a [head, tail] pair describing the chain of
    # "neighbor" links that visits every node holding that item.
    self._routes = {}

    # The root node is created with neither an item nor a count.
    self._root = FPNode(self, item=None, count=None)
94
-
94
+
95
95
@property
def root(self):
    """Return the node that sits at the top of this tree."""
    return self._root
99
-
99
+
100
100
def add (self , transaction ):
101
101
"""
102
102
Adds a transaction to the tree.
103
103
"""
104
-
104
+
105
105
point = self ._root
106
-
106
+
107
107
for item in transaction :
108
108
next_point = point .search (item )
109
109
if next_point :
@@ -115,25 +115,25 @@ def add(self, transaction):
115
115
# currently looking at.
116
116
next_point = FPNode (self , item )
117
117
point .add (next_point )
118
-
118
+
119
119
# Update the route of nodes that contain this item to include
120
120
# our new node.
121
121
self ._update_route (next_point )
122
-
122
+
123
123
point = next_point
124
-
124
+
125
125
def _update_route(self, point):
    """Append `point` to the chain of nodes that hold the same item."""
    assert self is point.tree

    try:
        head_and_tail = self._routes[point.item]
    except KeyError:
        # No node holds this item yet, so `point` is both head and tail.
        self._routes[point.item] = [point, point]
    else:
        # Link the current tail to the new node, then make the new node
        # the tail of the route.
        previous_tail = head_and_tail[1]
        previous_tail.neighbor = point
        head_and_tail[1] = point
136
-
136
+
137
137
def items(self):
    """
    Generate one 2-tuple for each item represented in the tree.

    Each tuple is `(item, nodes)`, where `item` is a key of the route table
    and `nodes` is the result of calling `self.nodes(item)` for that item.
    """
    # Iterate the dictionary directly rather than calling `iterkeys()`,
    # which only exists on Python 2; plain iteration yields the same keys
    # lazily on both Python 2 and Python 3.
    for item in self._routes:
        yield (item, self.nodes(item))
145
-
145
+
146
146
def nodes(self, item):
    """
    Yield each node holding `item`, following the item's neighbor chain.

    Nothing is yielded when the tree has no route for `item`.
    """
    route = self._routes.get(item)
    if route is None:
        return

    # route[0] is the head of the chain; walk the neighbor links from there.
    current = route[0]
    while current:
        yield current
        current = current.neighbor
159
-
159
+
160
160
def prefix_paths(self, item):
    """
    Generate one path for every node in the tree that holds `item`.

    Each path is a list of nodes ordered from the topmost non-root ancestor
    down to the node holding `item` itself.
    """
    def path_to(start):
        # Climb the parent links, collecting nodes until the root (or a
        # missing parent) is reached, then flip into top-down order.
        climbed = []
        current = start
        while current and not current.root:
            climbed.append(current)
            current = current.parent
        return climbed[::-1]

    return (path_to(holder) for holder in self.nodes(item))
172
-
172
+
173
173
def _removed (self , node ):
174
174
"""Called when `node` is removed from the tree; performs cleanup."""
175
-
175
+
176
176
head , tail = self ._routes [node .item ]
177
177
if node is head :
178
178
if node is tail or not node .neighbor :
@@ -193,14 +193,14 @@ def conditional_tree_from_paths(paths, minimum_support):
193
193
tree = FPTree ()
194
194
condition_item = None
195
195
items = set ()
196
-
196
+
197
197
# Import the nodes in the paths into the new tree. Only the counts of the
198
198
# leaf nodes matter; the remaining counts will be reconstructed from the
199
199
# leaf counts.
200
200
for path in paths :
201
201
if condition_item is None :
202
202
condition_item = path [- 1 ].item
203
-
203
+
204
204
point = tree .root
205
205
for node in path :
206
206
next_point = point .search (node .item )
@@ -212,17 +212,17 @@ def conditional_tree_from_paths(paths, minimum_support):
212
212
point .add (next_point )
213
213
tree ._update_route (next_point )
214
214
point = next_point
215
-
215
+
216
216
assert condition_item is not None
217
-
217
+
218
218
# Calculate the counts of the non-leaf nodes.
219
219
for path in tree .prefix_paths (condition_item ):
220
220
count = None
221
221
for node in reversed (path ):
222
222
if count is not None :
223
223
node ._count += count
224
224
count = node .count
225
-
225
+
226
226
# Eliminate the nodes for any items that are no longer frequent.
227
227
for item in items :
228
228
support = sum (n .count for n in tree .nodes (item ))
@@ -231,47 +231,47 @@ def conditional_tree_from_paths(paths, minimum_support):
231
231
for node in tree .nodes (item ):
232
232
if node .parent is not None :
233
233
node .parent .remove (node )
234
-
234
+
235
235
# Finally, remove the nodes corresponding to the item for which this
236
236
# conditional tree was generated.
237
237
for node in tree .nodes (condition_item ):
238
238
if node .parent is not None : # the node might already be an orphan
239
239
node .parent .remove (node )
240
-
240
+
241
241
return tree
242
-
242
+
243
243
class FPNode (object ):
244
244
"""A node in an FP tree."""
245
-
245
+
246
246
def __init__(self, tree, item, count=1):
    """
    Create a node for `item` that belongs to `tree`.

    `count` defaults to 1. The node starts with no parent, no children and
    no neighbor.
    """
    # What this node represents.
    self._tree, self._item, self._count = tree, item, count

    # Links to other nodes; all empty until the node is attached.
    self._parent = None
    self._neighbor = None
    self._children = {}
253
-
253
+
254
254
def add(self, child):
    """
    Attach the FPNode `child` beneath this node.

    If this node already has a child for the same item, nothing changes.
    Raises TypeError when `child` is not an FPNode.
    """
    if not isinstance(child, FPNode):
        raise TypeError("Can only add other FPNodes as children")

    if child.item in self._children:
        return

    self._children[child.item] = child
    child.parent = self
263
-
263
+
264
264
def search(self, item):
    """
    Look up the child node of this node that holds `item`.

    Returns that child, or `None` when this node has no child for `item`.
    """
    return self._children.get(item)
274
-
274
+
275
275
def remove (self , child ):
276
276
try :
277
277
if self ._children [child .item ] is child :
@@ -293,41 +293,41 @@ def remove(self, child):
293
293
raise ValueError ("that node is not a child of this node" )
294
294
except KeyError :
295
295
raise ValueError ("that node is not a child of this node" )
296
-
296
+
297
297
def __contains__(self, item):
    """Return True when this node has a child holding `item`."""
    return item in self._children
299
-
299
+
300
300
@property
def tree(self):
    """Return the tree this node was created for."""
    return self._tree
304
-
304
+
305
305
@property
def item(self):
    """Return the item stored in this node."""
    return self._item
309
-
309
+
310
310
@property
def count(self):
    """Return the count recorded for this node's item."""
    return self._count
314
-
314
+
315
315
def increment(self):
    """
    Add one to the count recorded for this node's item.

    Raises ValueError when the node has no count (its count is `None`).
    """
    current = self._count
    if current is None:
        raise ValueError("Root nodes have no associated count.")
    self._count = current + 1
320
-
320
+
321
321
@property
def root(self):
    """Return True when this node has neither an item nor a count."""
    if self._item is not None:
        return False
    return self._count is None
325
-
325
+
326
326
@property
def leaf(self):
    """Return True when this node has no children."""
    return not self._children
330
-
330
+
331
331
def parent ():
332
332
doc = "The node's parent."
333
333
def fget (self ):
@@ -340,7 +340,7 @@ def fset(self, value):
340
340
self ._parent = value
341
341
return locals ()
342
342
parent = property (** parent ())
343
-
343
+
344
344
def neighbor ():
345
345
doc = """
346
346
The node's neighbor; the one with the same value that is "to the right"
@@ -356,13 +356,13 @@ def fset(self, value):
356
356
self ._neighbor = value
357
357
return locals ()
358
358
neighbor = property (** neighbor ())
359
-
359
+
360
360
@property
def children(self):
    """Return a tuple of the nodes that are children of this node."""
    # `values()` exists on both Python 2 and Python 3, whereas the
    # `itervalues()` method is Python 2 only; the resulting tuple is the
    # same either way.
    return tuple(self._children.values())
364
-
365
-
364
+
365
+
366
366
def __repr__ (self ):
367
367
if self .root :
368
368
return "<%s (root)>" % type (self ).__name__
@@ -372,16 +372,16 @@ def __repr__(self):
372
372
if __name__ == '__main__' :
373
373
from optparse import OptionParser
374
374
import csv
375
-
375
+
376
376
p = OptionParser (usage = '%prog data_file' )
377
377
p .add_option ('-s' , '--minimum-support' , dest = 'minsup' , type = 'int' ,
378
378
help = 'Minimum itemset support (default: 2)' )
379
379
p .set_defaults (minsup = 2 )
380
-
380
+
381
381
options , args = p .parse_args ()
382
382
if len (args ) < 1 :
383
383
p .error ('must provide the path to a CSV file to read' )
384
-
384
+
385
385
f = open (args [0 ])
386
386
try :
387
387
for itemset in find_frequent_itemsets (csv .reader (f ), options .minsup ):
0 commit comments