-
-
Notifications
You must be signed in to change notification settings - Fork 32.2k
gh-135462: Fix quadratic complexity in processing special input in HTMLParser #135464
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
import unittest | ||
|
||
from unittest.mock import patch | ||
from test import support | ||
|
||
|
||
class EventCollector(html.parser.HTMLParser): | ||
|
@@ -430,28 +431,34 @@ def test_tolerant_parsing(self): | |
('data', '<'), | ||
('starttag', 'bc<', [('a', None)]), | ||
('endtag', 'html'), | ||
('data', '\n<img src="URL>'), | ||
('comment', '/img'), | ||
('endtag', 'html<')]) | ||
('data', '\n')]) | ||
|
||
def test_starttag_junk_chars(self): | ||
self._run_check("<", [('data', '<')]) | ||
self._run_check("<>", [('data', '<>')]) | ||
self._run_check("< >", [('data', '< >')]) | ||
self._run_check("< ", [('data', '< ')]) | ||
self._run_check("</>", []) | ||
self._run_check("<$>", [('data', '<$>')]) | ||
self._run_check("</$>", [('comment', '$')]) | ||
self._run_check("</", [('data', '</')]) | ||
self._run_check("</a", [('data', '</a')]) | ||
self._run_check("</a", []) | ||
self._run_check("</ a>", [('endtag', 'a')]) | ||
self._run_check("</ a", [('comment', ' a')]) | ||
self._run_check("<a<a>", [('starttag', 'a<a', [])]) | ||
self._run_check("</a<a>", [('endtag', 'a<a')]) | ||
self._run_check("<!", [('data', '<!')]) | ||
self._run_check("<a", [('data', '<a')]) | ||
self._run_check("<a foo='bar'", [('data', "<a foo='bar'")]) | ||
self._run_check("<a foo='bar", [('data', "<a foo='bar")]) | ||
self._run_check("<a foo='>'", [('data', "<a foo='>'")]) | ||
self._run_check("<a foo='>", [('data', "<a foo='>")]) | ||
self._run_check("<!", [('comment', '')]) | ||
self._run_check("<a", []) | ||
self._run_check("<a foo='bar'", []) | ||
self._run_check("<a foo='bar", []) | ||
self._run_check("<a foo='>'", []) | ||
self._run_check("<a foo='>", []) | ||
self._run_check("<a$>", [('starttag', 'a$', [])]) | ||
self._run_check("<a$b>", [('starttag', 'a$b', [])]) | ||
self._run_check("<a$b/>", [('startendtag', 'a$b', [])]) | ||
self._run_check("<a$b >", [('starttag', 'a$b', [])]) | ||
self._run_check("<a$b />", [('startendtag', 'a$b', [])]) | ||
self._run_check("</a$b>", [('endtag', 'a$b')]) | ||
|
||
def test_slashes_in_starttag(self): | ||
self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])]) | ||
|
@@ -576,21 +583,50 @@ def test_EOF_in_charref(self): | |
for html, expected in data: | ||
self._run_check(html, expected) | ||
|
||
def test_EOF_in_comments_or_decls(self): | ||
def test_eof_in_comments(self): | ||
data = [ | ||
('<!', [('data', '<!')]), | ||
('<!-', [('data', '<!-')]), | ||
('<!--', [('data', '<!--')]), | ||
('<![', [('data', '<![')]), | ||
('<![CDATA[', [('data', '<![CDATA[')]), | ||
('<![CDATA[x', [('data', '<![CDATA[x')]), | ||
('<!DOCTYPE', [('data', '<!DOCTYPE')]), | ||
('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]), | ||
('<!--', [('comment', '')]), | ||
('<!---', [('comment', '')]), | ||
('<!----', [('comment', '')]), | ||
('<!-----', [('comment', '-')]), | ||
('<!------', [('comment', '--')]), | ||
('<!----!', [('comment', '')]), | ||
('<!---!', [('comment', '-!')]), | ||
('<!---!>', [('comment', '-!>')]), | ||
('<!--foo', [('comment', 'foo')]), | ||
('<!--foo-', [('comment', 'foo')]), | ||
('<!--foo--', [('comment', 'foo')]), | ||
('<!--foo--!', [('comment', 'foo')]), | ||
('<!--<!--', [('comment', '<!')]), | ||
('<!--<!--!', [('comment', '<!')]), | ||
] | ||
for html, expected in data: | ||
self._run_check(html, expected) | ||
|
||
def test_eof_in_declarations(self): | ||
data = [ | ||
('<!', [('comment', '')]), | ||
('<!-', [('comment', '-')]), | ||
('<![', [('comment', '[')]), | ||
('<![CDATA[', [('unknown decl', 'CDATA[')]), | ||
('<![CDATA[x', [('unknown decl', 'CDATA[x')]), | ||
('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]), | ||
('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]), | ||
('<!DOCTYPE', [('decl', 'DOCTYPE')]), | ||
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]), | ||
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]), | ||
('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]), | ||
('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]), | ||
('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]), | ||
('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo', | ||
[('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]), | ||
] | ||
for html, expected in data: | ||
self._run_check(html, expected) | ||
|
||
def test_bogus_comments(self): | ||
html = ('<! not really a comment >' | ||
html = ('<!ELEMENT br EMPTY>' | ||
'<! not really a comment >' | ||
'<! not a comment either -->' | ||
'<! -- close enough -->' | ||
'<!><!<-- this was an empty comment>' | ||
|
@@ -604,6 +640,7 @@ def test_bogus_comments(self): | |
'<![CDATA]]>' # required '[' after CDATA | ||
) | ||
expected = [ | ||
('comment', 'ELEMENT br EMPTY'), | ||
('comment', ' not really a comment '), | ||
('comment', ' not a comment either --'), | ||
('comment', ' -- close enough --'), | ||
|
@@ -684,6 +721,26 @@ def test_convert_charrefs_dropped_text(self): | |
('endtag', 'a'), ('data', ' bar & baz')] | ||
) | ||
|
||
@support.requires_resource('cpu') | ||
def test_eof_no_quadratic_complexity(self): | ||
# Each of these examples used to take about an hour. | ||
# Now they take a fraction of a second. | ||
Comment on lines
+724
to
+727
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If they now take a fraction of a second, is there a reason to require the `cpu` resource? My understanding is that:
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. They totally take 1.3 seconds on my computer. All other tests take 0.1-0.2 seconds. It is a waste of time to run it several times for every update of any PR. Some buildbots are slower than my computer. I think that it is enough to run this test only on the fastest buildbots. We already used |
||
def check(source): | ||
parser = html.parser.HTMLParser() | ||
parser.feed(source) | ||
parser.close() | ||
n = 120_000 | ||
check("<a " * n) | ||
check("<a a=" * n) | ||
check("</a " * 14 * n) | ||
check("</a a=" * 11 * n) | ||
check("<!--" * 4 * n) | ||
check("<!" * 60 * n) | ||
check("<?" * 19 * n) | ||
check("</$" * 15 * n) | ||
check("<![CDATA[" * 9 * n) | ||
check("<!doctype" * 35 * n) | ||
|
||
|
||
class AttributesTestCase(TestCaseBase): | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
Fix quadratic complexity in processing specially crafted input in | ||
:class:`html.parser.HTMLParser`. End-of-file errors are now handled according | ||
to the HTML5 specs -- comments and declarations are automatically closed, | ||
tags are ignored. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems that now everything after the first `</html>` is ignored (except the `\n`). This is technically a change in behavior, which should be fine if the new behavior matches the HTML5 specs, but maybe should be noted in the whatsnew.
There also seem to be other minor changes in behavior that -- if they follow the specs -- might not need to be documented (a generic "Some additional invalid constructs are now handled according to the HTML5 specs." might be enough).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In this case, a double-quoted attribute value is never closed. This is https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-tag .
I have updated the NEWS entry.