@@ -23,8 +23,61 @@ def capfirst(x):
23
23
return x [0 ].upper () + x [1 :]
24
24
25
25
26
- # Set up regular expressions
27
- re_words = _lazy_re_compile (r"<[^>]+?>|([^<>\s]+)" , re .S )
26
+ # ----- Begin security-related performance workaround -----
27
+
28
+ # We used to have, below
29
+ #
30
+ # re_words = _lazy_re_compile(r"<[^>]+?>|([^<>\s]+)", re.S)
31
+ #
32
+ # But it was shown that this regex, in the way we use it here, has
33
+ # catastrophic worst-case performance, namely when it is applied to text
34
+ # containing only open brackets ("<<<..."). The class below provides the
35
+ # same services and answers for our use cases, but handles these edge
36
+ # cases much faster.
37
# Matches one "word": a maximal run of characters that are not "<", ">" or
# whitespace. The word is captured as group 1.
+ re_notag = _lazy_re_compile (r"([^<>\s]+)" , re .S )
38
# Matches either a single "<" (group 1 is then unset) or one word as defined
# above (captured as group 1).
+ re_prt = _lazy_re_compile (r"<|([^<>\s]+)" , re .S )
39
+
40
+
41
class WordsRegex:
    """Search helper that locates the next "<...>" tag or plain word.

    Only a static ``search(text, pos)`` is provided. The closing ">" of a
    tag is located with ``str.find`` rather than with a regular expression.
    """

    @staticmethod
    def search(text, pos):
        """Return a match-like object for the next tag or word at/after pos.

        The result is one of:
        * ``None`` when neither "<" nor a word occurs at or after ``pos``;
        * the match object from ``re_prt`` when a word comes first
          (group 1 holds the word);
        * a ``FakeMatch`` spanning "<" through the next ">" when a "<"
          comes first and a ">" follows it;
        * whatever ``re_notag.search`` returns when the "<" is never closed.
        """
        hit = re_prt.search(text, pos)

        # Nothing found, or group 1 captured a plain word: hand the result
        # back unchanged.
        if hit is None or hit[1] is not None:
            return hit

        # Only "<" matched. It counts as a tag only if a ">" follows it.
        closing = text.find(">", hit.end(0))
        if closing >= 0:
            # Build a stand-in match covering "<" up to and including ">".
            stop = closing + 1
            return FakeMatch(text[hit.start(0):stop], stop)

        # The "<" is never closed, so look for a plain word instead.
        return re_notag.search(text, pos + 1)
58
+
59
+
60
class FakeMatch:
    """Minimal match-like object holding a matched text and its end offset.

    It implements only part of the ``re.Match`` interface:

    * ``end()`` / ``end(0)`` returns the stored end offset;
    * ``obj[0]`` returns the stored text;
    * ``obj[1]`` is always ``None``.

    Any other group raises ``IndexError``, the exception ``re.Match`` raises
    for an unknown group.
    """

    __slots__ = ["_text", "_end"]

    def __init__(self, text, end):
        """Store ``text`` (returned for group 0) and ``end`` (returned by end())."""
        self._text, self._end = text, end

    def end(self, group=0):
        """Return the stored end offset.

        Only ``group=0`` is supported; any other value raises ``IndexError``.
        """
        # Raise explicitly rather than ``assert``: assertions are removed
        # under ``python -O``, which would let an unsupported group silently
        # return the group-0 offset.
        if group != 0:
            raise IndexError("This specific object takes only group=0")
        return self._end

    def __getitem__(self, group):
        """Return the stored text for group 0 and ``None`` for group 1.

        Any other group raises ``IndexError``.
        """
        if group == 1:
            return None
        # Explicit check for the same reason as in ``end()``.
        if group != 0:
            raise IndexError("This specific object takes only group in {0,1}")
        return self._text
75
+
76
+
77
+ # ----- End security-related performance workaround -----
78
+
79
+ # Set up regular expressions.
80
# re_words is bound to the WordsRegex class itself; it offers only the static
# search(text, pos) method.
+ re_words = WordsRegex
28
81
# Matches a whole "<...>" tag (nothing captured) or any single character
# (captured as group 1). re.S lets "." match newlines too.
re_chars = _lazy_re_compile (r"<[^>]+?>|(.)" , re .S )
29
82
# Matches a tag. Group 1 is an optional leading "/", group 2 the tag name
# (a lazy run of non-whitespace), group 3 an optional "/" (with any preceding
# whitespace) directly before ">".
re_tag = _lazy_re_compile (r"<(/)?(\S+?)(?:(\s*/)|\s.*?)?>" , re .S )
30
83
# Matches a "\r\n" pair or a lone "\r".
re_newlines = _lazy_re_compile (r"\r\n|\r" ) # Used in normalize_newlines
0 commit comments