@@ -150,7 +150,7 @@ class SequenceMatcher:
150150 Return an upper bound on ratio() very quickly.
151151 """
152152
def __init__(self, isjunk=None, a='', b='', autojunk=True):
    """Construct a SequenceMatcher.

    Optional arg isjunk is None (the default), or a one-argument
    predicate: isjunk(element) is true iff the element is junk and
    should be ignored when matching.

    Optional arg a is the first of the two sequences to be compared;
    by default an empty string.

    Optional arg b is the second of the two sequences to be compared;
    by default an empty string.  The elements of b must be hashable.
    See also .set_seqs() and .set_seq2().

    Optional arg autojunk should be set to False to disable the
    "automatic junk heuristic" that treats popular elements as junk
    (see module documentation for more information).
    """

    # User-supplied junk predicate (or None); consulted when b is indexed.
    self.isjunk = isjunk
    # When true, elements of a reasonably long b (>= 200 elements) that
    # account for more than 1 + 1% of it are treated as junk ("popular").
    self.autojunk = autojunk
    # Sequences start unset; set_seqs() installs them and builds the
    # b index structures.
    self.a = self.b = None
    self.set_seqs(a, b)
216222 def set_seqs (self , a , b ):
@@ -287,7 +293,7 @@ def set_seq2(self, b):
287293 # from starting any matching block at a junk element ...
288294 # also creates the fast isbjunk function ...
289295 # b2j also does not contain entries for "popular" elements, meaning
290-      # elements that account for more than 1% of the total elements, and
296+      # elements that account for more than 1 + 1% of the total elements,
291297 # when the sequence is reasonably large (>= 200 elements); this can
292298 # be viewed as an adaptive notion of semi-junk, and yields an enormous
293299 # speedup when, e.g., comparing program files with hundreds of
@@ -308,44 +314,37 @@ def __chain_b(self):
308314 # out the junk later is much cheaper than building b2j "right"
309315 # from the start.
310316 b = self .b
311- n = len (b )
312317 self .b2j = b2j = {}
313- populardict = {}
314- for i , elt in enumerate (b ):
315- if elt in b2j :
316- indices = b2j [elt ]
317- if n >= 200 and len (indices ) * 100 > n :
318- populardict [elt ] = 1
319- del indices [:]
320- else :
321- indices .append (i )
322- else :
323- b2j [elt ] = [i ]
324318
325- # Purge leftover indices for popular elements.
326- for elt in populardict :
327- del b2j [ elt ]
319+ for i , elt in enumerate ( b ):
320+ indices = b2j . setdefault ( elt , [])
321+ indices . append ( i )
328322
329- # Now b2j.keys() contains elements uniquely, and especially when
330- # the sequence is a string, that's usually a good deal smaller
331- # than len(string). The difference is the number of isjunk calls
332- # saved.
323+ # Purge junk elements
324+ junk = set ()
333325 isjunk = self .isjunk
334- junkdict = {}
335326 if isjunk :
336- for d in populardict , b2j :
337- for elt in list (d .keys ()):
338- if isjunk (elt ):
339- junkdict [elt ] = 1
340- del d [elt ]
341-
342- # Now for x in b, isjunk(x) == x in junkdict, but the
343- # latter is much faster. Note too that while there may be a
344- # lot of junk in the sequence, the number of *unique* junk
345- # elements is probably small. So the memory burden of keeping
346- # this dict alive is likely trivial compared to the size of b2j.
347- self .isbjunk = junkdict .__contains__
348- self .isbpopular = populardict .__contains__
327+ for elt in list (b2j .keys ()): # using list() since b2j is modified
328+ if isjunk (elt ):
329+ junk .add (elt )
330+ del b2j [elt ]
331+
332+ # Purge popular elements that are not junk
333+ popular = set ()
334+ n = len (b )
335+ if self .autojunk and n >= 200 :
336+ ntest = n // 100 + 1
337+ for elt , idxs in list (b2j .items ()):
338+ if len (idxs ) > ntest :
339+ popular .add (elt )
340+ del b2j [elt ]
341+
342+ # Now for x in b, isjunk(x) == x in junk, but the latter is much faster.
343+ # Since the number of *unique* junk elements is probably small, the
344+ # memory burden of keeping this set alive is likely trivial compared to
345+ # the size of b2j.
346+ self .isbjunk = junk .__contains__
347+ self .isbpopular = popular .__contains__
349348
350349 def find_longest_match (self , alo , ahi , blo , bhi ):
351350 """Find longest matching block in a[alo:ahi] and b[blo:bhi].
0 commit comments