Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 99f9637

Browse files
committed
Issue 2986: Add autojunk parameter to SequenceMatcher to turn off heuristic. Patch by Terry Reedy, Eli Bendersky, and Simon Cross
1 parent bd86301 commit 99f9637

4 files changed

Lines changed: 96 additions & 40 deletions

File tree

Doc/library/difflib.rst

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ can be used for example, for comparing files, and can produce difference
1717
information in various formats, including HTML and context and unified
1818
diffs. For comparing directories and files, see also, the :mod:`filecmp` module.
1919

20+
2021
.. class:: SequenceMatcher
2122

2223
This is a flexible class for comparing pairs of sequences of any type, so long
@@ -35,6 +36,14 @@ diffs. For comparing directories and files, see also, the :mod:`filecmp` module.
3536
complicated way on how many elements the sequences have in common; best case
3637
time is linear.
3738

39+
**Automatic junk heuristic:** :class:`SequenceMatcher` supports a heuristic that
40+
automatically treats certain sequence items as junk. The heuristic counts how many
41+
times each individual item appears in the sequence. If an item's duplicates (after
42+
the first one) account for more than 1% of the sequence and the sequence is at least
43+
200 items long, this item is marked as "popular" and is treated as junk for
44+
the purpose of sequence matching. This heuristic can be turned off by setting
45+
the ``autojunk`` argument to ``False`` when creating the :class:`SequenceMatcher`.
46+
3847

3948
.. class:: Differ
4049

@@ -324,7 +333,7 @@ SequenceMatcher Objects
324333
The :class:`SequenceMatcher` class has this constructor:
325334

326335

327-
.. class:: SequenceMatcher(isjunk=None, a='', b='')
336+
.. class:: SequenceMatcher(isjunk=None, a='', b='', autojunk=True)
328337

329338
Optional argument *isjunk* must be ``None`` (the default) or a one-argument
330339
function that takes a sequence element and returns true if and only if the
@@ -340,6 +349,9 @@ The :class:`SequenceMatcher` class has this constructor:
340349
The optional arguments *a* and *b* are sequences to be compared; both default to
341350
empty strings. The elements of both sequences must be :term:`hashable`.
342351

352+
The optional argument *autojunk* can be used to disable the automatic junk
353+
heuristic.
354+
343355
:class:`SequenceMatcher` objects have the following methods:
344356

345357

Lib/difflib.py

Lines changed: 36 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ class SequenceMatcher:
150150
Return an upper bound on ratio() very quickly.
151151
"""
152152

153-
def __init__(self, isjunk=None, a='', b=''):
153+
def __init__(self, isjunk=None, a='', b='', autojunk=True):
154154
"""Construct a SequenceMatcher.
155155
156156
Optional arg isjunk is None (the default), or a one-argument
@@ -168,6 +168,10 @@ def __init__(self, isjunk=None, a='', b=''):
168168
Optional arg b is the second of two sequences to be compared. By
169169
default, an empty string. The elements of b must be hashable. See
170170
also .set_seqs() and .set_seq2().
171+
172+
Optional arg autojunk should be set to False to disable the
173+
"automatic junk heuristic" that treats popular elements as junk
174+
(see module documentation for more information).
171175
"""
172176

173177
# Members:
@@ -206,11 +210,13 @@ def __init__(self, isjunk=None, a='', b=''):
206210
# DOES NOT WORK for x in a!
207211
# isbpopular
208212
# for x in b, isbpopular(x) is true iff b is reasonably long
209-
# (at least 200 elements) and x accounts for more than 1% of
210-
# its elements. DOES NOT WORK for x in a!
213+
# (at least 200 elements) and x occurs more than 1 + 1% of len(b)
214+
# times in b (when autojunk is enabled).
215+
# DOES NOT WORK for x in a!
211216

212217
self.isjunk = isjunk
213218
self.a = self.b = None
219+
self.autojunk = autojunk
214220
self.set_seqs(a, b)
215221

216222
def set_seqs(self, a, b):
@@ -287,7 +293,7 @@ def set_seq2(self, b):
287293
# from starting any matching block at a junk element ...
288294
# also creates the fast isbjunk function ...
289295
# b2j also does not contain entries for "popular" elements, meaning
290-
# elements that account for more than 1% of the total elements, and
296+
# elements that account for more than 1 + 1% of the total elements, and
291297
# when the sequence is reasonably large (>= 200 elements); this can
292298
# be viewed as an adaptive notion of semi-junk, and yields an enormous
293299
# speedup when, e.g., comparing program files with hundreds of
@@ -308,44 +314,37 @@ def __chain_b(self):
308314
# out the junk later is much cheaper than building b2j "right"
309315
# from the start.
310316
b = self.b
311-
n = len(b)
312317
self.b2j = b2j = {}
313-
populardict = {}
314-
for i, elt in enumerate(b):
315-
if elt in b2j:
316-
indices = b2j[elt]
317-
if n >= 200 and len(indices) * 100 > n:
318-
populardict[elt] = 1
319-
del indices[:]
320-
else:
321-
indices.append(i)
322-
else:
323-
b2j[elt] = [i]
324318

325-
# Purge leftover indices for popular elements.
326-
for elt in populardict:
327-
del b2j[elt]
319+
for i, elt in enumerate(b):
320+
indices = b2j.setdefault(elt, [])
321+
indices.append(i)
328322

329-
# Now b2j.keys() contains elements uniquely, and especially when
330-
# the sequence is a string, that's usually a good deal smaller
331-
# than len(string). The difference is the number of isjunk calls
332-
# saved.
323+
# Purge junk elements
324+
junk = set()
333325
isjunk = self.isjunk
334-
junkdict = {}
335326
if isjunk:
336-
for d in populardict, b2j:
337-
for elt in list(d.keys()):
338-
if isjunk(elt):
339-
junkdict[elt] = 1
340-
del d[elt]
341-
342-
# Now for x in b, isjunk(x) == x in junkdict, but the
343-
# latter is much faster. Note too that while there may be a
344-
# lot of junk in the sequence, the number of *unique* junk
345-
# elements is probably small. So the memory burden of keeping
346-
# this dict alive is likely trivial compared to the size of b2j.
347-
self.isbjunk = junkdict.__contains__
348-
self.isbpopular = populardict.__contains__
327+
for elt in list(b2j.keys()): # using list() since b2j is modified
328+
if isjunk(elt):
329+
junk.add(elt)
330+
del b2j[elt]
331+
332+
# Purge popular elements that are not junk
333+
popular = set()
334+
n = len(b)
335+
if self.autojunk and n >= 200:
336+
ntest = n // 100 + 1
337+
for elt, idxs in list(b2j.items()):
338+
if len(idxs) > ntest:
339+
popular.add(elt)
340+
del b2j[elt]
341+
342+
# Now for x in b, isjunk(x) == x in junk, but the latter is much faster.
343+
# Since the number of *unique* junk elements is probably small, the
344+
# memory burden of keeping this set alive is likely trivial compared to
345+
# the size of b2j.
346+
self.isbjunk = junk.__contains__
347+
self.isbpopular = popular.__contains__
349348

350349
def find_longest_match(self, alo, ahi, blo, bhi):
351350
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].

Lib/test/test_difflib.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,47 @@
44
import doctest
55
import sys
66

7-
class TestSFbugs(unittest.TestCase):
87

8+
class TestWithAscii(unittest.TestCase):
9+
def test_one_insert(self):
10+
sm = difflib.SequenceMatcher(None, 'b' * 100, 'a' + 'b' * 100)
11+
self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
12+
self.assertEqual(list(sm.get_opcodes()),
13+
[ ('insert', 0, 0, 0, 1),
14+
('equal', 0, 100, 1, 101)])
15+
sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50)
16+
self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
17+
self.assertEqual(list(sm.get_opcodes()),
18+
[ ('equal', 0, 50, 0, 50),
19+
('insert', 50, 50, 50, 51),
20+
('equal', 50, 100, 51, 101)])
21+
22+
def test_one_delete(self):
23+
sm = difflib.SequenceMatcher(None, 'a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40)
24+
self.assertAlmostEqual(sm.ratio(), 0.994, places=3)
25+
self.assertEqual(list(sm.get_opcodes()),
26+
[ ('equal', 0, 40, 0, 40),
27+
('delete', 40, 41, 40, 40),
28+
('equal', 41, 81, 40, 80)])
29+
30+
31+
class TestAutojunk(unittest.TestCase):
32+
"""Tests for the autojunk parameter added in 2.7"""
33+
def test_one_insert_homogenous_sequence(self):
34+
# By default autojunk=True and the heuristic kicks in for a sequence
35+
# of length 200+
36+
seq1 = 'b' * 200
37+
seq2 = 'a' + 'b' * 200
38+
39+
sm = difflib.SequenceMatcher(None, seq1, seq2)
40+
self.assertAlmostEqual(sm.ratio(), 0, places=3)
41+
42+
# Now turn the heuristic off
43+
sm = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False)
44+
self.assertAlmostEqual(sm.ratio(), 0.9975, places=3)
45+
46+
47+
class TestSFbugs(unittest.TestCase):
948
def test_ratio_for_null_seqn(self):
1049
# Check clearing of SF bug 763023
1150
s = difflib.SequenceMatcher(None, [], [])
@@ -184,7 +223,9 @@ def test_no_trailing_tab_on_empty_filedate(self):
184223
def test_main():
185224
difflib.HtmlDiff._default_prefix = 0
186225
Doctests = doctest.DocTestSuite(difflib)
187-
run_unittest(TestSFpatches, TestSFbugs, TestOutputFormat, Doctests)
226+
run_unittest(
227+
TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs,
228+
TestOutputFormat, Doctests)
188229

189230
if __name__ == '__main__':
190231
test_main()

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ Core and Builtins
3737
Library
3838
-------
3939

40+
- Issue #2986: difflib.SequenceMatcher gets a new parameter, autojunk, which
41+
can be set to False to turn off the previously undocumented 'popularity'
42+
heuristic. Patch by Terry Reedy and Eli Bendersky.
43+
4044
- Issue #9846: zipfile is now correctly closing underlying file objects.
4145

4246
- Issue #10459: Update CJK character names to Unicode 6.0.

0 commit comments

Comments
 (0)