diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 58f6bb3b1e932d..94f4aaecfc61b3 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -25,6 +25,7 @@ charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') starttagopen = re.compile('<[a-zA-Z]') +endtagopen = re.compile('') commentclose = re.compile(r'--\s*>') # Note: @@ -176,7 +177,7 @@ def goahead(self, end): k = self.parse_pi(i) elif startswith("', i + 1) - if k < 0: - k = rawdata.find('<', i + 1) - if k < 0: - k = i + 1 - else: - k += 1 - if self.convert_charrefs and not self.cdata_elem: - self.handle_data(unescape(rawdata[i:k])) + if starttagopen.match(rawdata, i): # < + letter + pass + elif startswith("", [('data', '<>')]) + self._run_check("< >", [('data', '< >')]) + self._run_check("< ", [('data', '< ')]) self._run_check("", []) + self._run_check("<$>", [('data', '<$>')]) self._run_check("", [('comment', '$')]) self._run_check("", [('endtag', 'a')]) + self._run_check("", [('starttag', 'a", [('endtag', 'a'", [('data', "'", []) + self._run_check("", [('starttag', 'a$b', [])]) self._run_check("", [('startendtag', 'a$b', [])]) self._run_check("", [('starttag', 'a$b', [])]) self._run_check("", [('startendtag', 'a$b', [])]) + self._run_check("", [('endtag', 'a$b')]) def test_slashes_in_starttag(self): self._run_check('', [('startendtag', 'a', [('foo', 'var')])]) @@ -537,13 +545,56 @@ def test_EOF_in_charref(self): for html, expected in data: self._run_check(html, expected) - def test_broken_comments(self): - html = ('' + def test_eof_in_comments(self): + data = [ + ('', [('comment', '-!>')]), + ('' '' '' '') expected = [ + ('comment', 'ELEMENT br EMPTY'), ('comment', ' not really a comment '), ('comment', ' not a comment either --'), ('comment', ' -- close enough --'), @@ -598,6 +649,26 @@ def test_convert_charrefs_dropped_text(self): ('endtag', 'a'), ('data', ' bar & baz')] ) + @support.requires_resource('cpu') + def test_eof_no_quadratic_complexity(self): + # Each of these examples used to take about an hour. + # Now they take a fraction of a second. + def check(source): + parser = html.parser.HTMLParser() + parser.feed(source) + parser.close() + n = 120_000 + check("