@@ -23,8 +23,61 @@ def capfirst(x):
2323 return x [0 ].upper () + x [1 :]
2424
2525
26- # Set up regular expressions
27- re_words = _lazy_re_compile (r"<[^>]+?>|([^<>\s]+)" , re .S )
26+ # ----- Begin security-related performance workaround -----
27+
28+ # We used to have, below
29+ #
30+ # re_words = _lazy_re_compile(r"<[^>]+?>|([^<>\s]+)", re.S)
31+ #
# But it was shown that this regex, in the way we use it here, has
# catastrophic worst-case performance on some inputs, namely text that
# consists only of open brackets ("<<<..."). The class below provides the
# same service and the correct answers for our use cases, but handles these
# edge cases much faster.
# A "word": one run of characters that are not "<", ">" or whitespace
# (captured as group 1).
re_notag = _lazy_re_compile(r"([^<>\s]+)", re.S)
# Either a single "<" (group 1 is None) or a word as above (group 1 is set).
re_prt = _lazy_re_compile(r"<|([^<>\s]+)", re.S)
39+
40+
class WordsRegex:
    r"""Stand-in for the compiled pattern ``<[^>]+?>|([^<>\s]+)``.

    Only ``search(text, pos)`` is provided. It yields the same matches as the
    pattern would, but avoids re-scanning the rest of the text for every "<"
    when no closing ">" exists.
    """

    @staticmethod
    def search(text, pos):
        """Return the first tag or word found in ``text`` at or after ``pos``.

        The result is ``None`` when nothing is found, a regular match object
        for a word (group 1 holds the word), or a ``FakeMatch`` for a
        "<...>" tag (group 1 is ``None``, group 0 is the tag text).
        """
        while True:
            # Look for "<" or a non-tag word.
            partial = re_prt.search(text, pos)
            if partial is None or partial[1] is not None:
                return partial

            # "<" was found, look for a closing ">".
            after_open = partial.end(0)
            close = text.find(">", after_open)
            if close < 0:
                # ">" cannot be found anywhere ahead, so no tag can match any
                # more; only a word can. Everything before the "<" has already
                # been ruled out, so resume right after it.
                return re_notag.search(text, after_open)
            if close == after_open:
                # "<>" has nothing between the brackets. The original pattern
                # requires at least one character there, so it is not a tag:
                # skip both brackets and keep scanning.
                pos = close + 1
                continue

            # "<" followed by a ">" was found -- fake a match.
            close += 1
            return FakeMatch(text[partial.start(0):close], close)
58+
59+
class FakeMatch:
    """Minimal imitation of a regex match object for a "<...>" tag.

    It supports only what is implemented here: ``match[0]`` (the matched
    text), ``match[1]`` (always ``None``, i.e. "not a word") and
    ``match.end()`` / ``match.end(0)`` (index just past the matched text).
    Any other group raises ``IndexError``, as a real match object does for an
    unknown group. Explicit raises are used instead of ``assert`` so the
    checks are not stripped when Python runs with ``-O``.
    """

    __slots__ = ("_text", "_end")

    def __init__(self, text, end):
        """Store the matched ``text`` and the index ``end`` just past it."""
        self._text, self._end = text, end

    def __repr__(self):
        return "%s(%r, %r)" % (type(self).__name__, self._text, self._end)

    def __getitem__(self, group):
        """Return the matched text for group 0 and ``None`` for group 1."""
        if group == 1:
            return None
        if group != 0:
            raise IndexError("This specific object takes only group in {0,1}")
        return self._text

    def end(self, group=0):
        """Return the end index of the match; only group 0 is supported."""
        if group != 0:
            raise IndexError("This specific object takes only group=0")
        return self._end
75+
76+
77+ # ----- End security-related performance workaround -----
78+
# Set up regular expressions.
# Word scanning goes through the WordsRegex class instead of a compiled
# pattern.
re_words = WordsRegex
# A whole "<...>" tag, or else any single character (captured as group 1).
re_chars = _lazy_re_compile(r"<[^>]+?>|(.)", re.S)
# Tag parts: optional leading "/" (group 1), tag name (group 2) and an
# optional trailing "/" (group 3).
re_tag = _lazy_re_compile(r"<(/)?(\S+?)(?:(\s*/)|\s.*?)?>", re.S)
re_newlines = _lazy_re_compile(r"\r\n|\r")  # Used in normalize_newlines
0 commit comments