diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 1e30956fe24f83..ba416e7fa6e3fe 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -27,6 +27,7 @@ attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?') starttagopen = re.compile('<[a-zA-Z]') +endtagopen = re.compile('') commentclose = re.compile(r'--\s*>') # Note: @@ -195,7 +196,7 @@ def goahead(self, end): k = self.parse_pi(i) elif startswith("', i + 1) - if k < 0: - k = rawdata.find('<', i + 1) - if k < 0: - k = i + 1 - else: - k += 1 - if self.convert_charrefs and not self.cdata_elem: - self.handle_data(unescape(rawdata[i:k])) + if starttagopen.match(rawdata, i): # < + letter + pass + elif startswith("", [('data', '<>')]) + self._run_check("< >", [('data', '< >')]) + self._run_check("< ", [('data', '< ')]) self._run_check("", []) + self._run_check("<$>", [('data', '<$>')]) self._run_check("", [('comment', '$')]) self._run_check("", [('endtag', 'a')]) + self._run_check("", [('starttag', 'a", [('endtag', 'a'", [('data', "'", []) + self._run_check("", [('starttag', 'a$b', [])]) self._run_check("", [('startendtag', 'a$b', [])]) self._run_check("", [('starttag', 'a$b', [])]) self._run_check("", [('startendtag', 'a$b', [])]) + self._run_check("", [('endtag', 'a$b')]) def test_slashes_in_starttag(self): self._run_check('', [('startendtag', 'a', [('foo', 'var')])]) @@ -576,21 +583,50 @@ def test_EOF_in_charref(self): for html, expected in data: self._run_check(html, expected) - def test_EOF_in_comments_or_decls(self): + def test_eof_in_comments(self): data = [ - ('', [('comment', '-!>')]), + ('' '' '' @@ -604,6 +640,7 @@ def test_bogus_comments(self): '' # required '[' after CDATA ) expected = [ + ('comment', 'ELEMENT br EMPTY'), ('comment', ' not really a comment '), ('comment', ' not a comment either --'), ('comment', ' -- close enough --'), @@ -684,6 +721,26 @@ def test_convert_charrefs_dropped_text(self): ('endtag', 'a'), ('data', ' bar & baz')] ) + @support.requires_resource('cpu') + def test_eof_no_quadratic_complexity(self): + # Each of these examples used to take about an hour. + # Now they take a fraction of a second. + def check(source): + parser = html.parser.HTMLParser() + parser.feed(source) + parser.close() + n = 120_000 + check("