Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 31e1ed0

Browse files
authored
difflib.py: improve the run time of pathological diffs
1 parent 4702b7b commit 31e1ed0

1 file changed

Lines changed: 18 additions & 5 deletions

File tree

Lib/difflib.py

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -890,7 +890,7 @@ def _plain_replace(self, a, alo, ahi, b, blo, bhi):
890890
for g in first, second:
891891
yield from g
892892

893-
def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
893+
def _fancy_replace(self, a, alo, ahi, b, blo, bhi, _gravity=1e-6):
894894
r"""
895895
When replacing one block of lines with another, search the blocks
896896
for *similar* lines; the best-matching pair (if any) is used as a
@@ -918,26 +918,39 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
918918
# search for the pair that matches best without being identical
919919
# (identical lines must be junk lines, & we don't want to synch up
920920
# on junk -- unless we have to)
921+
922+
# for pathological cases with many equal ratios prefer to split
923+
# closer to the middle of a, b chunks such that the resulting
924+
# branching is more optimal (bisect-like)
925+
def _drag_to_center(i, lo, hi):
926+
# any convex function with a maximum at (lo + hi - 1) / 2
927+
# this one is zero at edges lo, hi - 1 and _gravity in the middle
928+
return _gravity * (1 - ((2 * i - lo - hi + 1) / (hi - lo - 1)) ** 2)
929+
# with the weight above, the best_ratio becomes slightly bigger
930+
# which means that the real cutoff is slightly smaller than 0.75
931+
921932
for j in range(blo, bhi):
922933
bj = b[j]
923934
cruncher.set_seq2(bj)
935+
weight_b = _drag_to_center(j, blo, bhi)
924936
for i in range(alo, ahi):
925937
ai = a[i]
926938
if ai == bj:
927939
if eqi is None:
928940
eqi, eqj = i, j
929941
continue
930942
cruncher.set_seq1(ai)
943+
weight_ab = weight_b + _drag_to_center(i, alo, ahi)
931944
# computing similarity is expensive, so use the quick
932945
# upper bounds first -- have seen this speed up messy
933946
# compares by a factor of 3.
934947
# note that ratio() is only expensive to compute the first
935948
# time it's called on a sequence pair; the expensive part
936949
# of the computation is cached by cruncher
937-
if cruncher.real_quick_ratio() > best_ratio and \
938-
cruncher.quick_ratio() > best_ratio and \
939-
cruncher.ratio() > best_ratio:
940-
best_ratio, best_i, best_j = cruncher.ratio(), i, j
950+
if cruncher.real_quick_ratio() + weight_ab > best_ratio and \
951+
cruncher.quick_ratio() + weight_ab > best_ratio and \
952+
cruncher.ratio() + weight_ab > best_ratio:
953+
best_ratio, best_i, best_j = cruncher.ratio() + weight_ab, i, j
941954
if best_ratio < cutoff:
942955
# no non-identical "pretty close" pair
943956
if eqi is None:

0 commit comments

Comments
 (0)